1//  Copyright (c) 2014 Couchbase, Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// 		http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package ar
16
17import (
18	"bytes"
19
20	"github.com/blevesearch/bleve/analysis"
21	"github.com/blevesearch/bleve/registry"
22)
23
24const StemmerName = "stemmer_ar"
25
26// These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer
27var prefixes = [][]rune{
28	[]rune("ال"),
29	[]rune("وال"),
30	[]rune("بال"),
31	[]rune("كال"),
32	[]rune("فال"),
33	[]rune("لل"),
34	[]rune("و"),
35}
36var suffixes = [][]rune{
37	[]rune("ها"),
38	[]rune("ان"),
39	[]rune("ات"),
40	[]rune("ون"),
41	[]rune("ين"),
42	[]rune("يه"),
43	[]rune("ية"),
44	[]rune("ه"),
45	[]rune("ة"),
46	[]rune("ي"),
47}
48
49type ArabicStemmerFilter struct{}
50
51func NewArabicStemmerFilter() *ArabicStemmerFilter {
52	return &ArabicStemmerFilter{}
53}
54
55func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
56	for _, token := range input {
57		term := stem(token.Term)
58		token.Term = term
59	}
60	return input
61}
62
63func canStemPrefix(input, prefix []rune) bool {
64	// Wa- prefix requires at least 3 characters.
65	if len(prefix) == 1 && len(input) < 4 {
66		return false
67	}
68	// Other prefixes require only 2.
69	if len(input)-len(prefix) < 2 {
70		return false
71	}
72	for i := range prefix {
73		if prefix[i] != input[i] {
74			return false
75		}
76	}
77	return true
78}
79
80func canStemSuffix(input, suffix []rune) bool {
81	// All suffixes require at least 2 characters after stemming.
82	if len(input)-len(suffix) < 2 {
83		return false
84	}
85	stemEnd := len(input) - len(suffix)
86	for i := range suffix {
87		if suffix[i] != input[stemEnd+i] {
88			return false
89		}
90	}
91	return true
92}
93
94func stem(input []byte) []byte {
95	runes := bytes.Runes(input)
96	// Strip a single prefix.
97	for _, p := range prefixes {
98		if canStemPrefix(runes, p) {
99			runes = runes[len(p):]
100			break
101		}
102	}
103	// Strip off multiple suffixes, in their order in the suffixes array.
104	for _, s := range suffixes {
105		if canStemSuffix(runes, s) {
106			runes = runes[:len(runes)-len(s)]
107		}
108	}
109	return analysis.BuildTermFromRunes(runes)
110}
111
112func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
113	return NewArabicStemmerFilter(), nil
114}
115
116func init() {
117	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
118}
119