1// Copyright (c) 2014 Couchbase, Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package ar 16 17import ( 18 "bytes" 19 20 "github.com/blevesearch/bleve/analysis" 21 "github.com/blevesearch/bleve/registry" 22) 23 24const NormalizeName = "normalize_ar" 25 26const ( 27 Alef = '\u0627' 28 AlefMadda = '\u0622' 29 AlefHamzaAbove = '\u0623' 30 AlefHamzaBelow = '\u0625' 31 Yeh = '\u064A' 32 DotlessYeh = '\u0649' 33 TehMarbuta = '\u0629' 34 Heh = '\u0647' 35 Tatweel = '\u0640' 36 Fathatan = '\u064B' 37 Dammatan = '\u064C' 38 Kasratan = '\u064D' 39 Fatha = '\u064E' 40 Damma = '\u064F' 41 Kasra = '\u0650' 42 Shadda = '\u0651' 43 Sukun = '\u0652' 44) 45 46type ArabicNormalizeFilter struct { 47} 48 49func NewArabicNormalizeFilter() *ArabicNormalizeFilter { 50 return &ArabicNormalizeFilter{} 51} 52 53func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 54 for _, token := range input { 55 term := normalize(token.Term) 56 token.Term = term 57 } 58 return input 59} 60 61func normalize(input []byte) []byte { 62 runes := bytes.Runes(input) 63 for i := 0; i < len(runes); i++ { 64 switch runes[i] { 65 case AlefMadda, AlefHamzaAbove, AlefHamzaBelow: 66 runes[i] = Alef 67 case DotlessYeh: 68 runes[i] = Yeh 69 case TehMarbuta: 70 runes[i] = Heh 71 case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun: 72 runes = analysis.DeleteRune(runes, i) 73 i-- 74 } 75 } 76 return analysis.BuildTermFromRunes(runes) 77} 78 79func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { 80 return NewArabicNormalizeFilter(), nil 81} 82 83func init() { 84 registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor) 85} 86