1 /////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
5 /////////////////////////////////////////////////////////////////////////////
6 
7 #include "TestInc.h"
8 #include "BaseTokenStreamFixture.h"
9 #include "ArabicLetterTokenizer.h"
10 #include "PersianNormalizationFilter.h"
11 #include "StringReader.h"
12 
13 using namespace Lucene;
14 
15 class PersianNormalizationFilterTest : public BaseTokenStreamFixture {
16 public:
~PersianNormalizationFilterTest()17     virtual ~PersianNormalizationFilterTest() {
18     }
19 
20 public:
check(const String & input,const String & expected)21     void check(const String& input, const String& expected) {
22         ArabicLetterTokenizerPtr tokenStream  = newLucene<ArabicLetterTokenizer>(newLucene<StringReader>(input));
23         PersianNormalizationFilterPtr filter = newLucene<PersianNormalizationFilter>(tokenStream);
24         checkTokenStreamContents(filter, newCollection<String>(expected));
25     }
26 };
27 
/// A trailing U+06CC is expected to come out of the filter as U+064A.
TEST_F(PersianNormalizationFilterTest, testFarsiYeh) {
    // UTF-8 for U+0647 U+0627 U+06CC
    const uint8_t rawInput[] = {0xd9, 0x87, 0xd8, 0xa7, 0xdb, 0x8c};
    // UTF-8 for U+0647 U+0627 U+064A
    const uint8_t rawExpected[] = {0xd9, 0x87, 0xd8, 0xa7, 0xd9, 0x8a};

    const String input = UTF8_TO_STRING(rawInput);
    const String expected = UTF8_TO_STRING(rawExpected);
    check(input, expected);
}
33 
/// A trailing U+06D2 is expected to come out of the filter as U+064A.
TEST_F(PersianNormalizationFilterTest, testYehBarree) {
    // UTF-8 for U+0647 U+0627 U+06D2
    const uint8_t rawInput[] = {0xd9, 0x87, 0xd8, 0xa7, 0xdb, 0x92};
    // UTF-8 for U+0647 U+0627 U+064A
    const uint8_t rawExpected[] = {0xd9, 0x87, 0xd8, 0xa7, 0xd9, 0x8a};

    const String input = UTF8_TO_STRING(rawInput);
    const String expected = UTF8_TO_STRING(rawExpected);
    check(input, expected);
}
39 
/// A leading U+06A9 is expected to come out of the filter as U+0643;
/// the remaining five code points are expected unchanged.
TEST_F(PersianNormalizationFilterTest, testKeheh) {
    // UTF-8 for U+06A9 U+0634 U+0627 U+0646 U+062F U+0646
    const uint8_t rawInput[] = {0xda, 0xa9, 0xd8, 0xb4, 0xd8, 0xa7, 0xd9, 0x86, 0xd8, 0xaf, 0xd9, 0x86};
    // UTF-8 for U+0643 U+0634 U+0627 U+0646 U+062F U+0646
    const uint8_t rawExpected[] = {0xd9, 0x83, 0xd8, 0xb4, 0xd8, 0xa7, 0xd9, 0x86, 0xd8, 0xaf, 0xd9, 0x86};

    const String input = UTF8_TO_STRING(rawInput);
    const String expected = UTF8_TO_STRING(rawExpected);
    check(input, expected);
}
45 
/// A trailing U+06C0 is expected to come out of the filter as U+0647.
TEST_F(PersianNormalizationFilterTest, testHehYeh) {
    // UTF-8 for U+0643 U+062A U+0627 U+0628 U+06C0
    const uint8_t rawInput[] = {0xd9, 0x83, 0xd8, 0xaa, 0xd8, 0xa7, 0xd8, 0xa8, 0xdb, 0x80};
    // UTF-8 for U+0643 U+062A U+0627 U+0628 U+0647
    const uint8_t rawExpected[] = {0xd9, 0x83, 0xd8, 0xaa, 0xd8, 0xa7, 0xd8, 0xa8, 0xd9, 0x87};

    const String input = UTF8_TO_STRING(rawInput);
    const String expected = UTF8_TO_STRING(rawExpected);
    check(input, expected);
}
51 
/// A trailing U+0654 following U+0647 is expected to be absent from the
/// filter's output, leaving the token ending in U+0647.
TEST_F(PersianNormalizationFilterTest, testHehHamzaAbove) {
    // UTF-8 for U+0643 U+062A U+0627 U+0628 U+0647 U+0654
    const uint8_t rawInput[] = {0xd9, 0x83, 0xd8, 0xaa, 0xd8, 0xa7, 0xd8, 0xa8, 0xd9, 0x87, 0xd9, 0x94};
    // UTF-8 for U+0643 U+062A U+0627 U+0628 U+0647
    const uint8_t rawExpected[] = {0xd9, 0x83, 0xd8, 0xaa, 0xd8, 0xa7, 0xd8, 0xa8, 0xd9, 0x87};

    const String input = UTF8_TO_STRING(rawInput);
    const String expected = UTF8_TO_STRING(rawExpected);
    check(input, expected);
}
57 
/// A trailing U+06C1 is expected to come out of the filter as U+0647.
TEST_F(PersianNormalizationFilterTest, testHehGoal) {
    // UTF-8 for U+0632 U+0627 U+062F U+06C1
    const uint8_t rawInput[] = {0xd8, 0xb2, 0xd8, 0xa7, 0xd8, 0xaf, 0xdb, 0x81};
    // UTF-8 for U+0632 U+0627 U+062F U+0647
    const uint8_t rawExpected[] = {0xd8, 0xb2, 0xd8, 0xa7, 0xd8, 0xaf, 0xd9, 0x87};

    const String input = UTF8_TO_STRING(rawInput);
    const String expected = UTF8_TO_STRING(rawExpected);
    check(input, expected);
}
63