1//  Copyright (c) 2014 Couchbase, Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// 		http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package ar
16
17import (
18	"bytes"
19
20	"github.com/blevesearch/bleve/analysis"
21	"github.com/blevesearch/bleve/registry"
22)
23
// NormalizeName is the name under which the Arabic normalization token
// filter is registered with the registry (see init).
const NormalizeName = "normalize_ar"
25
// Arabic code points consulted by normalize. The trailing comment on each
// constant gives the character's Unicode name.
//
// Alef, Yeh and Heh are the canonical letters that variants are folded into;
// AlefMadda, AlefHamzaAbove, AlefHamzaBelow, DotlessYeh and TehMarbuta are the
// variants that get folded; Tatweel and the marks from Fathatan through Sukun
// are removed from terms altogether.
const (
	Alef           = '\u0627' // ARABIC LETTER ALEF
	AlefMadda      = '\u0622' // ARABIC LETTER ALEF WITH MADDA ABOVE
	AlefHamzaAbove = '\u0623' // ARABIC LETTER ALEF WITH HAMZA ABOVE
	AlefHamzaBelow = '\u0625' // ARABIC LETTER ALEF WITH HAMZA BELOW
	Yeh            = '\u064A' // ARABIC LETTER YEH
	DotlessYeh     = '\u0649' // ARABIC LETTER ALEF MAKSURA
	TehMarbuta     = '\u0629' // ARABIC LETTER TEH MARBUTA
	Heh            = '\u0647' // ARABIC LETTER HEH
	Tatweel        = '\u0640' // ARABIC TATWEEL
	Fathatan       = '\u064B' // ARABIC FATHATAN
	Dammatan       = '\u064C' // ARABIC DAMMATAN
	Kasratan       = '\u064D' // ARABIC KASRATAN
	Fatha          = '\u064E' // ARABIC FATHA
	Damma          = '\u064F' // ARABIC DAMMA
	Kasra          = '\u0650' // ARABIC KASRA
	Shadda         = '\u0651' // ARABIC SHADDA
	Sukun          = '\u0652' // ARABIC SUKUN
)
45
// ArabicNormalizeFilter is a token filter that rewrites token terms into
// their normalized Arabic form. It has no fields and therefore no
// configuration or state.
type ArabicNormalizeFilter struct{}

// NewArabicNormalizeFilter returns a pointer to a new ArabicNormalizeFilter.
func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
	return new(ArabicNormalizeFilter)
}
52
53func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
54	for _, token := range input {
55		term := normalize(token.Term)
56		token.Term = term
57	}
58	return input
59}
60
61func normalize(input []byte) []byte {
62	runes := bytes.Runes(input)
63	for i := 0; i < len(runes); i++ {
64		switch runes[i] {
65		case AlefMadda, AlefHamzaAbove, AlefHamzaBelow:
66			runes[i] = Alef
67		case DotlessYeh:
68			runes[i] = Yeh
69		case TehMarbuta:
70			runes[i] = Heh
71		case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
72			runes = analysis.DeleteRune(runes, i)
73			i--
74		}
75	}
76	return analysis.BuildTermFromRunes(runes)
77}
78
79func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
80	return NewArabicNormalizeFilter(), nil
81}
82
// init registers NormalizerFilterConstructor with the registry under
// NormalizeName when the package is loaded.
func init() {
	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}
86