1 /////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
5 /////////////////////////////////////////////////////////////////////////////
6 
7 #include "TestInc.h"
8 #include "BaseTokenStreamFixture.h"
9 #include "PersianAnalyzer.h"
10 
11 using namespace Lucene;
12 
13 typedef BaseTokenStreamFixture PersianAnalyzerTest;
14 
15 /// These tests show how the combination of tokenization (breaking on zero-width
16 /// non-joiner), normalization (such as treating arabic YEH and farsi YEH the
17 /// same), and stopwords creates a light-stemming effect for verbs.
18 ///
19 /// These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
20 
21 /// active present indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs1) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf  // U+062E U+0648 U+0631 U+062F
    };
    // Sole expected string: U+062E U+0648 U+0631 U+062F.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
28 
29 /// active preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs2) {
    // Input and expected string are the same four letters:
    // U+062E U+0648 U+0631 U+062F.
    const uint8_t inputUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
36 
37 /// active imperfective preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs3) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf  // U+062E U+0648 U+0631 U+062F
    };
    // Sole expected string: U+062E U+0648 U+0631 U+062F.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
44 
45 /// active future indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs4) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf,  // U+062E U+0648 U+0627 U+0647 U+062F
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf  // U+062E U+0648 U+0631 U+062F
    };
    // Sole expected string: the word after the space.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
54 
55 /// active present progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs5) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf,  // U+062F U+0627 U+0631 U+062F
        0x20,  // space
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf  // U+062E U+0648 U+0631 U+062F
    };
    // Sole expected string: the final four letters of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
64 
65 /// active preterite progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs6) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa,  // U+062F U+0627 U+0634 U+062A
        0x20,  // space
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf  // U+062E U+0648 U+0631 U+062F
    };
    // Sole expected string: the final four letters of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
74 
75 /// active perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs7) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa  // U+0627 U+0633 U+062A
    };
    // Sole expected string: the five letters before the non-joiner.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
84 
85 /// active imperfective perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs8) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa  // U+0627 U+0633 U+062A
    };
    // Sole expected string: the five-letter unit between the two non-joiners.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
94 
95 /// active pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs9) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf  // U+0628 U+0648 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
104 
105 /// active imperfective pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs10) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf  // U+0628 U+0648 U+062F
    };
    // Sole expected string: the five-letter unit in the middle of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
114 
115 /// active preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs11) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
124 
125 /// active imperfective preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs12) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the five-letter unit in the middle of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
134 
135 /// active pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs13) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,  // U+0628 U+0648 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
144 
145 /// active imperfective pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs14) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,  // U+0628 U+0648 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the five-letter unit that follows the non-joiner.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
155 
156 /// passive present indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs15) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf  // U+0634 U+0648 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
165 
166 /// passive preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs16) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xb4, 0xd8, 0xaf  // U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
173 
174 /// passive imperfective preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs17) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xb4, 0xd8, 0xaf  // U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
183 
184 /// passive perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs18) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,  // U+0634 U+062F U+0647
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa  // U+0627 U+0633 U+062A
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
193 
194 /// passive imperfective perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs19) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,  // U+0634 U+062F U+0647
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa  // U+0627 U+0633 U+062A
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
204 
205 /// passive pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs20) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,  // U+0634 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf  // U+0628 U+0648 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
214 
215 /// passive imperfective pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs21) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,  // U+0634 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf  // U+0628 U+0648 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
225 
226 /// passive future indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs22) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf,  // U+062E U+0648 U+0627 U+0647 U+062F
        0x20,  // space
        0xd8, 0xb4, 0xd8, 0xaf  // U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
235 
236 /// passive present progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs23) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf,  // U+062F U+0627 U+0631 U+062F
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf  // U+0634 U+0648 U+062F
    };
    // Sole expected string: the second word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
246 
247 /// passive preterite progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs24) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa,  // U+062F U+0627 U+0634 U+062A
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xb4, 0xd8, 0xaf  // U+0634 U+062F
    };
    // Sole expected string: the second word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
257 
258 /// passive present subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs25) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf  // U+0634 U+0648 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
267 
268 /// passive preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs26) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,  // U+0634 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
277 
278 /// passive imperfective preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs27) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,  // U+0634 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
288 
289 /// passive pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs28) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,  // U+0634 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,  // U+0628 U+0648 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
299 
300 /// passive imperfective pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs29) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd9, 0x85, 0xdb, 0x8c,  // U+0645 U+06CC (Farsi yeh)
        0xe2, 0x80, 0x8c,  // U+200C zero-width non-joiner
        0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,  // U+0634 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,  // U+0628 U+0648 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
310 
311 /// active present subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs30) {
    // Input and expected string are the same five letters:
    // U+0628 U+062E U+0648 U+0631 U+062F (the leading U+0628 is kept).
    const uint8_t inputUtf8[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t expectedUtf8[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
318 
/// These tests show how the combination of tokenization and stopwords creates a
/// light-stemming effect for verbs.
321 ///
322 /// In this case, these forms are presented with alternative orthography, using
323 /// arabic yeh and whitespace. This yeh phenomenon is common for legacy text
324 /// due to some previous bugs in Microsoft Windows.
325 ///
326 /// These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
327 
/// active present indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective1) {
    // UTF-8 input, one orthographic unit per row (Arabic yeh and a plain space).
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,  // U+0645 U+064A (Arabic yeh)
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf  // U+062E U+0648 U+0631 U+062F
    };
    // Sole expected string: the word after the space.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
335 
336 /// active preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective2) {
    // Input and expected string are the same four letters:
    // U+062E U+0648 U+0631 U+062F.
    const uint8_t inputUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
343 
344 /// active imperfective preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective3) {
    // UTF-8 input, one orthographic unit per row (Arabic yeh and a plain space).
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,  // U+0645 U+064A (Arabic yeh)
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf  // U+062E U+0648 U+0631 U+062F
    };
    // Sole expected string: the word after the space.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
351 
352 /// active future indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective4) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf,  // U+062E U+0648 U+0627 U+0647 U+062F
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf  // U+062E U+0648 U+0631 U+062F
    };
    // Sole expected string: the word after the space.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
361 
362 /// active present progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective5) {
    // UTF-8 input, one orthographic unit per row (Arabic yeh and plain spaces).
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf,  // U+062F U+0627 U+0631 U+062F
        0x20,  // space
        0xd9, 0x85, 0xd9, 0x8a,  // U+0645 U+064A (Arabic yeh)
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf  // U+062E U+0648 U+0631 U+062F
    };
    // Sole expected string: the last word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
371 
372 /// active preterite progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective6) {
    // UTF-8 input, one orthographic unit per row (Arabic yeh and plain spaces).
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa,  // U+062F U+0627 U+0634 U+062A
        0x20,  // space
        0xd9, 0x85, 0xd9, 0x8a,  // U+0645 U+064A (Arabic yeh)
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf  // U+062E U+0648 U+0631 U+062F
    };
    // Sole expected string: the last word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
381 
382 /// active perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective7) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa  // U+0627 U+0633 U+062A
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
391 
392 /// active imperfective perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective8) {
    // UTF-8 input, one orthographic unit per row (Arabic yeh and plain spaces).
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,  // U+0645 U+064A (Arabic yeh)
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa  // U+0627 U+0633 U+062A
    };
    // Sole expected string: the middle word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
401 
402 /// active pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective9) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf  // U+0628 U+0648 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
411 
412 /// active imperfective pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective10) {
    // UTF-8 input, one orthographic unit per row (Arabic yeh and plain spaces).
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,  // U+0645 U+064A (Arabic yeh)
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf  // U+0628 U+0648 U+062F
    };
    // Sole expected string: the middle word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
421 
422 /// active preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective11) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
431 
432 /// active imperfective preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective12) {
    // UTF-8 input, one orthographic unit per row (Arabic yeh and plain spaces).
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,  // U+0645 U+064A (Arabic yeh)
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the middle word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
441 
442 /// active pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective13) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,  // U+0628 U+0648 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
451 
452 /// active imperfective pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective14) {
    // UTF-8 input, one orthographic unit per row (Arabic yeh and plain spaces).
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,  // U+0645 U+064A (Arabic yeh)
        0x20,  // space
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,  // U+0628 U+0648 U+062F U+0647
        0x20,  // space
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf  // U+0628 U+0627 U+0634 U+062F
    };
    // Sole expected string: the second word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
462 
463 /// passive present indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective15) {
    // UTF-8 input, one orthographic unit per row (Arabic yeh and plain spaces).
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd9, 0x85, 0xd9, 0x8a,  // U+0645 U+064A (Arabic yeh)
        0x20,  // space
        0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf  // U+0634 U+0648 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
472 
473 /// passive preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective16) {
    // UTF-8 input, one orthographic unit per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,  // U+062E U+0648 U+0631 U+062F U+0647
        0x20,  // space
        0xd8, 0xb4, 0xd8, 0xaf  // U+0634 U+062F
    };
    // Sole expected string: the first word of the input.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer,
                    UTF8_TO_STRING(inputUtf8),
                    newCollection<String>(UTF8_TO_STRING(expectedUtf8)));
}
480 
481 /// passive imperfective preterite indicative
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective17)482 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective17) {
483     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
484     const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a,
485                              0x20, 0xd8, 0xb4, 0xd8, 0xaf
486                             };
487     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
488     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
489 }
490 
491 /// passive perfect indicative
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective18)492 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective18) {
493     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
494     const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf,
495                              0xd9, 0x87, 0x20, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa
496                             };
497     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
498     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
499 }
500 
501 /// passive imperfective perfect indicative
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective19)502 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective19) {
503     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
504     const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a,
505                              0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa
506                             };
507     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
508     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
509 }
510 
511 /// passive pluperfect indicative
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective20)512 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective20) {
513     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
514     const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf,
515                              0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf
516                             };
517     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
518     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
519 }
520 
521 /// passive imperfective pluperfect indicative
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective21)522 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective21) {
523     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
524     const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a,
525                              0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf
526                             };
527     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
528     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
529 }
530 
531 /// passive future indicative
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective22)532 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective22) {
533     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
534     const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xae, 0xd9, 0x88,
535                              0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf, 0x20, 0xd8, 0xb4, 0xd8, 0xaf
536                             };
537     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
538     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
539 }
540 
541 /// passive present progressive indicative
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective23)542 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective23) {
543     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
544     const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1,
545                              0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd9, 0x88, 0xd8,
546                              0xaf
547                             };
548     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
549     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
550 }
551 
552 /// passive preterite progressive indicative
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective24)553 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective24) {
554     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
555     const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1,
556                              0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd8, 0xaf
557                             };
558     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
559     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
560 }
561 
562 /// passive present subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective25)563 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective25) {
564     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
565     const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd9, 0x88,
566                              0xd8, 0xaf
567                             };
568     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
569     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
570 }
571 
572 /// passive preterite subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective26)573 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective26) {
574     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
575     const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf,
576                              0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
577                             };
578     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
579     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
580 }
581 
582 /// passive imperfective preterite subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective27)583 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective27) {
584     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
585     const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a,
586                              0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8,
587                              0xaf
588                             };
589     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
590     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
591 }
592 
593 /// passive pluperfect subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective28)594 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective28) {
595     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
596     const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf,
597                              0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8,
598                              0xa7, 0xd8, 0xb4, 0xd8, 0xaf
599                             };
600     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
601     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
602 }
603 
604 /// passive imperfective pluperfect subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective29)605 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective29) {
606     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
607     const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a,
608                              0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9,
609                              0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
610                             };
611     const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
612     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
613 }
614 
615 /// active present subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective30)616 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective30) {
617     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
618     const uint8_t first[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
619     const uint8_t second[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
620     checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
621 }
622 
623 /// These tests show how the combination of tokenization (breaking on zero-width
624 /// non-joiner or space) and stopwords creates a light-stemming effect for
625 /// nouns, removing the plural -ha.
626 
/// Noun followed by a second word after a plain space (0x20): the analyzer is
/// expected to emit only one token, byte-identical to the first word.
TEST_F(PersianAnalyzerTest, testBehaviorNouns1) {
    // One space-delimited word per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf, 0x20,
        0xd9, 0x87, 0xd8, 0xa7
    };
    const uint8_t expectedUtf8[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    const String text = UTF8_TO_STRING(inputUtf8);
    Collection<String> tokens = newCollection<String>(UTF8_TO_STRING(expectedUtf8));
    checkAnalyzesTo(analyzer, text, tokens);
}
633 
/// Same noun and suffix bytes as testBehaviorNouns1, but joined by
/// 0xe2 0x80 0x8c (UTF-8 for U+200C, zero-width non-joiner) instead of a
/// space. The expected output is still the single leading-noun token.
TEST_F(PersianAnalyzerTest, testBehaviorNouns2) {
    // Rows: noun, U+200C separator, trailing suffix.
    const uint8_t inputUtf8[] = {
        0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf,
        0xe2, 0x80, 0x8c,
        0xd9, 0x87, 0xd8, 0xa7
    };
    const uint8_t expectedUtf8[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    const String text = UTF8_TO_STRING(inputUtf8);
    Collection<String> tokens = newCollection<String>(UTF8_TO_STRING(expectedUtf8));
    checkAnalyzesTo(analyzer, text, tokens);
}
640 
641 /// Test showing that non-Persian text is treated very much like SimpleAnalyzer (lowercased, etc)
TEST_F(PersianAnalyzerTest,testBehaviorNonPersian)642 TEST_F(PersianAnalyzerTest, testBehaviorNonPersian) {
643     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
644     checkAnalyzesTo(a, L"English test.", newCollection<String>(L"english", L"test"));
645 }
646 
/// Runs a five-word, space-separated (0x20) UTF-8 input through
/// checkAnalyzesToReuse; exactly one token, byte-identical to the first
/// word, is expected.
TEST_F(PersianAnalyzerTest, testReusableTokenStream1) {
    // One space-delimited word per row.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20,
        0xd9, 0x85, 0xd9, 0x8a, 0x20,
        0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20,
        0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20,
        0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    const uint8_t expectedUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87
    };

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    const String text = UTF8_TO_STRING(inputUtf8);
    Collection<String> tokens = newCollection<String>(UTF8_TO_STRING(expectedUtf8));
    checkAnalyzesToReuse(analyzer, text, tokens);
}
656 
/// Runs a noun joined to a suffix by 0xe2 0x80 0x8c (UTF-8 for U+200C,
/// zero-width non-joiner) through checkAnalyzesToReuse; only the leading
/// noun is expected back as a token.
TEST_F(PersianAnalyzerTest, testReusableTokenStream2) {
    // Rows: noun, U+200C separator, trailing suffix.
    const uint8_t inputUtf8[] = {
        0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf,
        0xe2, 0x80, 0x8c,
        0xd9, 0x87, 0xd8, 0xa7
    };
    const uint8_t expectedUtf8[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    const String text = UTF8_TO_STRING(inputUtf8);
    Collection<String> tokens = newCollection<String>(UTF8_TO_STRING(expectedUtf8));
    checkAnalyzesToReuse(analyzer, text, tokens);
}
663 
664 /// Test that custom stopwords work, and are not case-sensitive.
TEST_F(PersianAnalyzerTest,testCustomStopwords)665 TEST_F(PersianAnalyzerTest, testCustomStopwords) {
666     Collection<String> stopWords = newCollection<String>(L"the", L"and", L"a");
667     PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT, HashSet<String>::newInstance(stopWords.begin(), stopWords.end()));
668     checkAnalyzesTo(a, L"The quick brown fox.", newCollection<String>(L"quick", L"brown", L"fox"));
669 }
670