1// Copyright (c) 2014 Couchbase, Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package ar 16 17import ( 18 "reflect" 19 "testing" 20 21 "github.com/blevesearch/bleve/analysis" 22) 23 24func TestArabicNormalizeFilter(t *testing.T) { 25 tests := []struct { 26 input analysis.TokenStream 27 output analysis.TokenStream 28 }{ 29 // AlifMadda 30 { 31 input: analysis.TokenStream{ 32 &analysis.Token{ 33 Term: []byte("آجن"), 34 }, 35 }, 36 output: analysis.TokenStream{ 37 &analysis.Token{ 38 Term: []byte("اجن"), 39 }, 40 }, 41 }, 42 // AlifHamzaAbove 43 { 44 input: analysis.TokenStream{ 45 &analysis.Token{ 46 Term: []byte("أحمد"), 47 }, 48 }, 49 output: analysis.TokenStream{ 50 &analysis.Token{ 51 Term: []byte("احمد"), 52 }, 53 }, 54 }, 55 // AlifHamzaBelow 56 { 57 input: analysis.TokenStream{ 58 &analysis.Token{ 59 Term: []byte("إعاذ"), 60 }, 61 }, 62 output: analysis.TokenStream{ 63 &analysis.Token{ 64 Term: []byte("اعاذ"), 65 }, 66 }, 67 }, 68 // AlifMaksura 69 { 70 input: analysis.TokenStream{ 71 &analysis.Token{ 72 Term: []byte("بنى"), 73 }, 74 }, 75 output: analysis.TokenStream{ 76 &analysis.Token{ 77 Term: []byte("بني"), 78 }, 79 }, 80 }, 81 // TehMarbuta 82 { 83 input: analysis.TokenStream{ 84 &analysis.Token{ 85 Term: []byte("فاطمة"), 86 }, 87 }, 88 output: analysis.TokenStream{ 89 &analysis.Token{ 90 Term: []byte("فاطمه"), 91 }, 92 }, 93 }, 94 // Tatweel 95 { 96 input: analysis.TokenStream{ 97 &analysis.Token{ 98 Term: []byte("روبرـــــت"), 99 }, 100 }, 101 output: analysis.TokenStream{ 102 &analysis.Token{ 103 Term: []byte("روبرت"), 104 }, 105 }, 106 }, 107 // Fatha 108 { 109 input: analysis.TokenStream{ 110 &analysis.Token{ 111 Term: []byte("مَبنا"), 112 }, 113 }, 114 output: analysis.TokenStream{ 115 &analysis.Token{ 116 Term: []byte("مبنا"), 117 }, 118 }, 119 }, 120 // Kasra 121 { 122 input: analysis.TokenStream{ 123 &analysis.Token{ 124 Term: []byte("علِي"), 125 }, 126 }, 127 output: analysis.TokenStream{ 128 &analysis.Token{ 129 Term: []byte("علي"), 130 }, 131 }, 132 }, 133 // Damma 134 { 135 input: analysis.TokenStream{ 136 &analysis.Token{ 137 Term: []byte("بُوات"), 138 }, 139 }, 140 output: analysis.TokenStream{ 141 &analysis.Token{ 142 Term: []byte("بوات"), 143 }, 144 }, 145 }, 146 // Fathatan 147 { 148 input: analysis.TokenStream{ 149 &analysis.Token{ 150 Term: []byte("ولداً"), 151 }, 152 }, 153 output: analysis.TokenStream{ 154 &analysis.Token{ 155 Term: []byte("ولدا"), 156 }, 157 }, 158 }, 159 // Kasratan 160 { 161 input: analysis.TokenStream{ 162 &analysis.Token{ 163 Term: []byte("ولدٍ"), 164 }, 165 }, 166 output: analysis.TokenStream{ 167 &analysis.Token{ 168 Term: []byte("ولد"), 169 }, 170 }, 171 }, 172 // Dammatan 173 { 174 input: analysis.TokenStream{ 175 &analysis.Token{ 176 Term: []byte("ولدٌ"), 177 }, 178 }, 179 output: analysis.TokenStream{ 180 &analysis.Token{ 181 Term: []byte("ولد"), 182 }, 183 }, 184 }, 185 // Sukun 186 { 187 input: analysis.TokenStream{ 188 &analysis.Token{ 189 Term: []byte("نلْسون"), 190 }, 191 }, 192 output: analysis.TokenStream{ 193 &analysis.Token{ 194 Term: []byte("نلسون"), 195 }, 196 }, 197 }, 198 // Shaddah 199 { 200 input: analysis.TokenStream{ 201 &analysis.Token{ 202 Term: []byte("هتميّ"), 203 }, 204 }, 205 output: analysis.TokenStream{ 206 &analysis.Token{ 207 Term: []byte("هتمي"), 208 }, 209 }, 210 }, 211 // empty 212 { 213 input: analysis.TokenStream{ 214 &analysis.Token{ 215 Term: []byte(""), 216 }, 217 }, 218 output: analysis.TokenStream{ 219 &analysis.Token{ 220 Term: []byte(""), 221 }, 222 }, 223 }, 224 } 225 226 arabicNormalizeFilter := NewArabicNormalizeFilter() 227 for _, test := range tests { 228 actual := arabicNormalizeFilter.Filter(test.input) 229 if !reflect.DeepEqual(actual, test.output) { 230 t.Errorf("expected %#v, got %#v", test.output, actual) 231 t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term) 232 } 233 } 234} 235