///////////////////////////////////////////////////////////////////////////// // Copyright (c) 2009-2014 Alan Wright. All rights reserved. // Distributable under the terms of either the Apache License (Version 2.0) // or the GNU Lesser General Public License. ///////////////////////////////////////////////////////////////////////////// #include "TestInc.h" #include "BaseTokenStreamFixture.h" #include "PersianAnalyzer.h" using namespace Lucene; typedef BaseTokenStreamFixture PersianAnalyzerTest; /// These tests show how the combination of tokenization (breaking on zero-width /// non-joiner), normalization (such as treating arabic YEH and farsi YEH the /// same), and stopwords creates a light-stemming effect for verbs. /// /// These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar /// active present indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs1) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf}; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active preterite indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs2) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf}; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active imperfective preterite indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs3) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf}; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf}; checkAnalyzesTo(a, UTF8_TO_STRING(first), 
newCollection(UTF8_TO_STRING(second))); } /// active future indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs4) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active present progressive indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs5) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf, 0x20, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active preterite progressive indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs6) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa, 0x20, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active perfect indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs7) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0xe2, 0x80, 0x8c, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active imperfective perfect indicative TEST_F(PersianAnalyzerTest, 
testBehaviorVerbs8) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0xe2, 0x80, 0x8c, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active pluperfect indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs9) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active imperfective pluperfect indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs10) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active preterite subjunctive TEST_F(PersianAnalyzerTest, testBehaviorVerbs11) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active imperfective preterite subjunctive TEST_F(PersianAnalyzerTest, testBehaviorVerbs12) { PersianAnalyzerPtr a = 
newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active pluperfect subjunctive TEST_F(PersianAnalyzerTest, testBehaviorVerbs13) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// active imperfective pluperfect subjunctive TEST_F(PersianAnalyzerTest, testBehaviorVerbs14) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// passive present indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs15) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// passive preterite 
indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs16) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf}; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// passive imperfective preterite indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs17) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// passive perfect indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs18) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0xe2, 0x80, 0x8c, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// passive imperfective perfect indicative TEST_F(PersianAnalyzerTest, testBehaviorVerbs19) { PersianAnalyzerPtr a = newLucene(LuceneVersion::LUCENE_CURRENT); const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0xe2, 0x80, 0x8c, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa }; const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87}; checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection(UTF8_TO_STRING(second))); } /// passive pluperfect indicative 
TEST_F(PersianAnalyzerTest, testBehaviorVerbs20) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive imperfective pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs21) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive future indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs22) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf, 0x20, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive present progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs23) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive preterite progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs24) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive present subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs25) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs26) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive imperfective preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs27) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs28) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive imperfective pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs29) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xdb, 0x8c, 0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active present subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs30) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// This test shows how the combination of tokenization and stopwords creates a
/// light-stemming effect for verbs.
///
/// In this case, these forms are presented with alternative orthography, using
/// arabic yeh and whitespace. This yeh phenomenon is common for legacy text
/// due to some previous bugs in Microsoft Windows.
///
/// These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar

/// active present indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective1) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective2) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active imperfective preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective3) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active future indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective4) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active present progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective5) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active preterite progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective6) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective7) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active imperfective perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective8) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective9) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active imperfective pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective10) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective11) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active imperfective preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective12) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective13) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active imperfective pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective14) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive present indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective15) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective16) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive imperfective preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective17) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective18) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive imperfective perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective19) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective20) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive imperfective pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective21) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive future indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective22) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf, 0x20, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive present progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective23) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive preterite progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective24) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive present subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective25) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective26) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive imperfective preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective27) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective28) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// passive imperfective pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective29) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// active present subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective30) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// These tests show how the combination of tokenization (breaking on zero-width
/// non-joiner or space) and stopwords creates a light-stemming effect for
/// nouns, removing the plural -ha.
TEST_F(PersianAnalyzerTest, testBehaviorNouns1) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    // Noun and plural suffix separated by a plain space (0x20).
    const uint8_t first[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf, 0x20, 0xd9, 0x87, 0xd8, 0xa7};
    const uint8_t second[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

TEST_F(PersianAnalyzerTest, testBehaviorNouns2) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    // Same noun, but separated by a zero-width non-joiner (0xe2 0x80 0x8c, U+200C).
    const uint8_t first[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf, 0xe2, 0x80, 0x8c, 0xd9, 0x87, 0xd8, 0xa7};
    const uint8_t second[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf};

    checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// Test showing that non-Persian text is treated very much like SimpleAnalyzer (lowercased, etc)
TEST_F(PersianAnalyzerTest, testBehaviorNonPersian) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    checkAnalyzesTo(a, L"English test.", newCollection<String>(L"english", L"test"));
}

/// Same expectations as the one-shot tests, checked through checkAnalyzesToReuse.
TEST_F(PersianAnalyzerTest, testReusableTokenStream1) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf};
    const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    checkAnalyzesToReuse(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

TEST_F(PersianAnalyzerTest, testReusableTokenStream2) {
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);

    const uint8_t first[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf, 0xe2, 0x80, 0x8c, 0xd9, 0x87, 0xd8, 0xa7};
    const uint8_t second[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf};

    checkAnalyzesToReuse(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
}

/// Test that custom stopwords work, and are not case-sensitive.
TEST_F(PersianAnalyzerTest, testCustomStopwords) {
    // The stop set is lower-case while the input starts with "The"; the
    // expected output omits it, which is what demonstrates case-insensitivity.
    Collection<String> stopWords = newCollection<String>(L"the", L"and", L"a");
    PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT, HashSet<String>::newInstance(stopWords.begin(), stopWords.end()));

    checkAnalyzesTo(a, L"The quick brown fox.", newCollection<String>(L"quick", L"brown", L"fox"));
}