1// Copyright (c) 2014 Couchbase, Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package ar 16 17import ( 18 "bytes" 19 20 "github.com/blevesearch/bleve/analysis" 21 "github.com/blevesearch/bleve/registry" 22) 23 24const StemmerName = "stemmer_ar" 25 26// These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer 27var prefixes = [][]rune{ 28 []rune("ال"), 29 []rune("وال"), 30 []rune("بال"), 31 []rune("كال"), 32 []rune("فال"), 33 []rune("لل"), 34 []rune("و"), 35} 36var suffixes = [][]rune{ 37 []rune("ها"), 38 []rune("ان"), 39 []rune("ات"), 40 []rune("ون"), 41 []rune("ين"), 42 []rune("يه"), 43 []rune("ية"), 44 []rune("ه"), 45 []rune("ة"), 46 []rune("ي"), 47} 48 49type ArabicStemmerFilter struct{} 50 51func NewArabicStemmerFilter() *ArabicStemmerFilter { 52 return &ArabicStemmerFilter{} 53} 54 55func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 56 for _, token := range input { 57 term := stem(token.Term) 58 token.Term = term 59 } 60 return input 61} 62 63func canStemPrefix(input, prefix []rune) bool { 64 // Wa- prefix requires at least 3 characters. 65 if len(prefix) == 1 && len(input) < 4 { 66 return false 67 } 68 // Other prefixes require only 2. 69 if len(input)-len(prefix) < 2 { 70 return false 71 } 72 for i := range prefix { 73 if prefix[i] != input[i] { 74 return false 75 } 76 } 77 return true 78} 79 80func canStemSuffix(input, suffix []rune) bool { 81 // All suffixes require at least 2 characters after stemming. 82 if len(input)-len(suffix) < 2 { 83 return false 84 } 85 stemEnd := len(input) - len(suffix) 86 for i := range suffix { 87 if suffix[i] != input[stemEnd+i] { 88 return false 89 } 90 } 91 return true 92} 93 94func stem(input []byte) []byte { 95 runes := bytes.Runes(input) 96 // Strip a single prefix. 97 for _, p := range prefixes { 98 if canStemPrefix(runes, p) { 99 runes = runes[len(p):] 100 break 101 } 102 } 103 // Strip off multiple suffixes, in their order in the suffixes array. 104 for _, s := range suffixes { 105 if canStemSuffix(runes, s) { 106 runes = runes[:len(runes)-len(s)] 107 } 108 } 109 return analysis.BuildTermFromRunes(runes) 110} 111 112func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { 113 return NewArabicStemmerFilter(), nil 114} 115 116func init() { 117 registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) 118} 119