1// Copyright (c) 2014 Couchbase, Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package ar 16 17import ( 18 "reflect" 19 "testing" 20 21 "github.com/blevesearch/bleve/analysis" 22 "github.com/blevesearch/bleve/registry" 23) 24 25func TestArabicAnalyzer(t *testing.T) { 26 tests := []struct { 27 input []byte 28 output analysis.TokenStream 29 }{ 30 { 31 input: []byte("كبير"), 32 output: analysis.TokenStream{ 33 &analysis.Token{ 34 Term: []byte("كبير"), 35 Position: 1, 36 Start: 0, 37 End: 8, 38 }, 39 }, 40 }, 41 // feminine marker 42 { 43 input: []byte("كبيرة"), 44 output: analysis.TokenStream{ 45 &analysis.Token{ 46 Term: []byte("كبير"), 47 Position: 1, 48 Start: 0, 49 End: 10, 50 }, 51 }, 52 }, 53 { 54 input: []byte("مشروب"), 55 output: analysis.TokenStream{ 56 &analysis.Token{ 57 Term: []byte("مشروب"), 58 Position: 1, 59 Start: 0, 60 End: 10, 61 }, 62 }, 63 }, 64 // plural -at 65 { 66 input: []byte("مشروبات"), 67 output: analysis.TokenStream{ 68 &analysis.Token{ 69 Term: []byte("مشروب"), 70 Position: 1, 71 Start: 0, 72 End: 14, 73 }, 74 }, 75 }, 76 // plural -in 77 { 78 input: []byte("أمريكيين"), 79 output: analysis.TokenStream{ 80 &analysis.Token{ 81 Term: []byte("امريك"), 82 Position: 1, 83 Start: 0, 84 End: 16, 85 }, 86 }, 87 }, 88 // singular with bare alif 89 { 90 input: []byte("امريكي"), 91 output: analysis.TokenStream{ 92 &analysis.Token{ 93 Term: []byte("امريك"), 94 Position: 1, 95 Start: 0, 96 End: 12, 97 }, 98 }, 99 }, 100 { 101 input: []byte("كتاب"), 102 output: analysis.TokenStream{ 103 &analysis.Token{ 104 Term: []byte("كتاب"), 105 Position: 1, 106 Start: 0, 107 End: 8, 108 }, 109 }, 110 }, 111 // definite article 112 { 113 input: []byte("الكتاب"), 114 output: analysis.TokenStream{ 115 &analysis.Token{ 116 Term: []byte("كتاب"), 117 Position: 1, 118 Start: 0, 119 End: 12, 120 }, 121 }, 122 }, 123 { 124 input: []byte("ما ملكت أيمانكم"), 125 output: analysis.TokenStream{ 126 &analysis.Token{ 127 Term: []byte("ملكت"), 128 Position: 2, 129 Start: 5, 130 End: 13, 131 }, 132 &analysis.Token{ 133 Term: []byte("ايمانكم"), 134 Position: 3, 135 Start: 14, 136 End: 28, 137 }, 138 }, 139 }, 140 // stopwords 141 { 142 input: []byte("الذين ملكت أيمانكم"), 143 output: analysis.TokenStream{ 144 &analysis.Token{ 145 Term: []byte("ملكت"), 146 Position: 2, 147 Start: 11, 148 End: 19, 149 }, 150 &analysis.Token{ 151 Term: []byte("ايمانكم"), 152 Position: 3, 153 Start: 20, 154 End: 34, 155 }, 156 }, 157 }, 158 // presentation form normalization 159 { 160 input: []byte("ﺍﻟﺴﻼﻢ"), 161 output: analysis.TokenStream{ 162 &analysis.Token{ 163 Term: []byte("سلام"), 164 Position: 1, 165 Start: 0, 166 End: 15, 167 }, 168 }, 169 }, 170 } 171 172 cache := registry.NewCache() 173 analyzer, err := cache.AnalyzerNamed(AnalyzerName) 174 if err != nil { 175 t.Fatal(err) 176 } 177 for _, test := range tests { 178 actual := analyzer.Analyze(test.input) 179 if !reflect.DeepEqual(actual, test.output) { 180 t.Errorf("expected %v, got %v", test.output, actual) 181 t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term) 182 } 183 } 184} 185