1 /////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
5 /////////////////////////////////////////////////////////////////////////////
6
7 #include "TestInc.h"
8 #include "BaseTokenStreamFixture.h"
9 #include "ArabicLetterTokenizer.h"
10 #include "PersianNormalizationFilter.h"
11 #include "StringReader.h"
12
13 using namespace Lucene;
14
15 class PersianNormalizationFilterTest : public BaseTokenStreamFixture {
16 public:
~PersianNormalizationFilterTest()17 virtual ~PersianNormalizationFilterTest() {
18 }
19
20 public:
check(const String & input,const String & expected)21 void check(const String& input, const String& expected) {
22 ArabicLetterTokenizerPtr tokenStream = newLucene<ArabicLetterTokenizer>(newLucene<StringReader>(input));
23 PersianNormalizationFilterPtr filter = newLucene<PersianNormalizationFilter>(tokenStream);
24 checkTokenStreamContents(filter, newCollection<String>(expected));
25 }
26 };
27
// Input ends in U+06CC (Farsi yeh); the expected term ends in U+064A
// (Arabic yeh). The leading U+0647 U+0627 is the same on both sides.
TEST_F(PersianNormalizationFilterTest, testFarsiYeh) {
    const uint8_t inputUtf8[] = {
        0xd9, 0x87, // U+0647
        0xd8, 0xa7, // U+0627
        0xdb, 0x8c  // U+06CC
    };
    const uint8_t expectedUtf8[] = {
        0xd9, 0x87, // U+0647
        0xd8, 0xa7, // U+0627
        0xd9, 0x8a  // U+064A
    };
    check(UTF8_TO_STRING(inputUtf8), UTF8_TO_STRING(expectedUtf8));
}
33
// Input ends in U+06D2 (yeh barree); the expected term ends in U+064A
// (Arabic yeh). The leading U+0647 U+0627 is the same on both sides.
TEST_F(PersianNormalizationFilterTest, testYehBarree) {
    const uint8_t inputUtf8[] = {
        0xd9, 0x87, // U+0647
        0xd8, 0xa7, // U+0627
        0xdb, 0x92  // U+06D2
    };
    const uint8_t expectedUtf8[] = {
        0xd9, 0x87, // U+0647
        0xd8, 0xa7, // U+0627
        0xd9, 0x8a  // U+064A
    };
    check(UTF8_TO_STRING(inputUtf8), UTF8_TO_STRING(expectedUtf8));
}
39
// Input starts with U+06A9 (keheh); the expected term starts with U+0643
// (Arabic kaf). The remaining five code points are the same on both sides.
TEST_F(PersianNormalizationFilterTest, testKeheh) {
    const uint8_t inputUtf8[] = {
        0xda, 0xa9, // U+06A9
        0xd8, 0xb4, 0xd8, 0xa7, 0xd9, 0x86, 0xd8, 0xaf, 0xd9, 0x86
    };
    const uint8_t expectedUtf8[] = {
        0xd9, 0x83, // U+0643
        0xd8, 0xb4, 0xd8, 0xa7, 0xd9, 0x86, 0xd8, 0xaf, 0xd9, 0x86
    };
    check(UTF8_TO_STRING(inputUtf8), UTF8_TO_STRING(expectedUtf8));
}
45
// Input ends in U+06C0 (heh with yeh above); the expected term ends in
// U+0647 (heh). The four leading code points are the same on both sides.
TEST_F(PersianNormalizationFilterTest, testHehYeh) {
    const uint8_t inputUtf8[] = {
        0xd9, 0x83, 0xd8, 0xaa, 0xd8, 0xa7, 0xd8, 0xa8,
        0xdb, 0x80  // U+06C0
    };
    const uint8_t expectedUtf8[] = {
        0xd9, 0x83, 0xd8, 0xaa, 0xd8, 0xa7, 0xd8, 0xa8,
        0xd9, 0x87  // U+0647
    };
    check(UTF8_TO_STRING(inputUtf8), UTF8_TO_STRING(expectedUtf8));
}
51
// Input ends in U+0647 followed by U+0654 (hamza above); the expected term
// is the same sequence without the trailing U+0654.
TEST_F(PersianNormalizationFilterTest, testHehHamzaAbove) {
    const uint8_t inputUtf8[] = {
        0xd9, 0x83, 0xd8, 0xaa, 0xd8, 0xa7, 0xd8, 0xa8,
        0xd9, 0x87, // U+0647
        0xd9, 0x94  // U+0654
    };
    const uint8_t expectedUtf8[] = {
        0xd9, 0x83, 0xd8, 0xaa, 0xd8, 0xa7, 0xd8, 0xa8,
        0xd9, 0x87  // U+0647
    };
    check(UTF8_TO_STRING(inputUtf8), UTF8_TO_STRING(expectedUtf8));
}
57
// Input ends in U+06C1 (heh goal); the expected term ends in U+0647 (heh).
// The three leading code points are the same on both sides.
TEST_F(PersianNormalizationFilterTest, testHehGoal) {
    const uint8_t inputUtf8[] = {
        0xd8, 0xb2, 0xd8, 0xa7, 0xd8, 0xaf,
        0xdb, 0x81  // U+06C1
    };
    const uint8_t expectedUtf8[] = {
        0xd8, 0xb2, 0xd8, 0xa7, 0xd8, 0xaf,
        0xd9, 0x87  // U+0647
    };
    check(UTF8_TO_STRING(inputUtf8), UTF8_TO_STRING(expectedUtf8));
}
63