1//  Copyright (c) 2014 Couchbase, Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// 		http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package ar
16
17import (
18	"reflect"
19	"testing"
20
21	"github.com/blevesearch/bleve/analysis"
22	"github.com/blevesearch/bleve/registry"
23)
24
25func TestArabicAnalyzer(t *testing.T) {
26	tests := []struct {
27		input  []byte
28		output analysis.TokenStream
29	}{
30		{
31			input: []byte("كبير"),
32			output: analysis.TokenStream{
33				&analysis.Token{
34					Term:     []byte("كبير"),
35					Position: 1,
36					Start:    0,
37					End:      8,
38				},
39			},
40		},
41		// feminine marker
42		{
43			input: []byte("كبيرة"),
44			output: analysis.TokenStream{
45				&analysis.Token{
46					Term:     []byte("كبير"),
47					Position: 1,
48					Start:    0,
49					End:      10,
50				},
51			},
52		},
53		{
54			input: []byte("مشروب"),
55			output: analysis.TokenStream{
56				&analysis.Token{
57					Term:     []byte("مشروب"),
58					Position: 1,
59					Start:    0,
60					End:      10,
61				},
62			},
63		},
64		// plural -at
65		{
66			input: []byte("مشروبات"),
67			output: analysis.TokenStream{
68				&analysis.Token{
69					Term:     []byte("مشروب"),
70					Position: 1,
71					Start:    0,
72					End:      14,
73				},
74			},
75		},
76		// plural -in
77		{
78			input: []byte("أمريكيين"),
79			output: analysis.TokenStream{
80				&analysis.Token{
81					Term:     []byte("امريك"),
82					Position: 1,
83					Start:    0,
84					End:      16,
85				},
86			},
87		},
88		// singular with bare alif
89		{
90			input: []byte("امريكي"),
91			output: analysis.TokenStream{
92				&analysis.Token{
93					Term:     []byte("امريك"),
94					Position: 1,
95					Start:    0,
96					End:      12,
97				},
98			},
99		},
100		{
101			input: []byte("كتاب"),
102			output: analysis.TokenStream{
103				&analysis.Token{
104					Term:     []byte("كتاب"),
105					Position: 1,
106					Start:    0,
107					End:      8,
108				},
109			},
110		},
111		// definite article
112		{
113			input: []byte("الكتاب"),
114			output: analysis.TokenStream{
115				&analysis.Token{
116					Term:     []byte("كتاب"),
117					Position: 1,
118					Start:    0,
119					End:      12,
120				},
121			},
122		},
123		{
124			input: []byte("ما ملكت أيمانكم"),
125			output: analysis.TokenStream{
126				&analysis.Token{
127					Term:     []byte("ملكت"),
128					Position: 2,
129					Start:    5,
130					End:      13,
131				},
132				&analysis.Token{
133					Term:     []byte("ايمانكم"),
134					Position: 3,
135					Start:    14,
136					End:      28,
137				},
138			},
139		},
140		// stopwords
141		{
142			input: []byte("الذين ملكت أيمانكم"),
143			output: analysis.TokenStream{
144				&analysis.Token{
145					Term:     []byte("ملكت"),
146					Position: 2,
147					Start:    11,
148					End:      19,
149				},
150				&analysis.Token{
151					Term:     []byte("ايمانكم"),
152					Position: 3,
153					Start:    20,
154					End:      34,
155				},
156			},
157		},
158		// presentation form normalization
159		{
160			input: []byte("ﺍﻟﺴﻼﻢ"),
161			output: analysis.TokenStream{
162				&analysis.Token{
163					Term:     []byte("سلام"),
164					Position: 1,
165					Start:    0,
166					End:      15,
167				},
168			},
169		},
170	}
171
172	cache := registry.NewCache()
173	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
174	if err != nil {
175		t.Fatal(err)
176	}
177	for _, test := range tests {
178		actual := analyzer.Analyze(test.input)
179		if !reflect.DeepEqual(actual, test.output) {
180			t.Errorf("expected %v, got %v", test.output, actual)
181			t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
182		}
183	}
184}
185