1// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:generate go run ../collate/maketables.go -cldr=23 -unicode=6.2.0 -types=search,searchjl -package=search
6
7// Package search provides language-specific search and string matching.
8//
9// Natural language matching can be intricate. For example, Danish will insist
10// "Århus" and "Aarhus" are the same name and Turkish will match I to ı (note
11// the lack of a dot) in a case-insensitive match. This package handles such
12// language-specific details.
13//
14// Text passed to any of the calls in this message does not need to be
15// normalized.
16package search // import "golang.org/x/text/search"
17
18import (
19	"strings"
20
21	"golang.org/x/text/internal/colltab"
22	"golang.org/x/text/language"
23)
24
25// An Option configures a Matcher.
26type Option func(*Matcher)
27
28var (
29	// WholeWord restricts matches to complete words. The default is to match at
30	// the character level.
31	WholeWord Option = nil
32
33	// Exact requires that two strings are their exact equivalent. For example
34	// å would not match aa in Danish. It overrides any of the ignore options.
35	Exact Option = nil
36
37	// Loose causes case, diacritics and width to be ignored.
38	Loose Option = loose
39
40	// IgnoreCase enables case-insensitive search.
41	IgnoreCase Option = ignoreCase
42
43	// IgnoreDiacritics causes diacritics to be ignored ("ö" == "o").
44	IgnoreDiacritics Option = ignoreDiacritics
45
46	// IgnoreWidth equates narrow with wide variants.
47	IgnoreWidth Option = ignoreWidth
48)
49
50func ignoreDiacritics(m *Matcher) { m.ignoreDiacritics = true }
51func ignoreCase(m *Matcher)       { m.ignoreCase = true }
52func ignoreWidth(m *Matcher)      { m.ignoreWidth = true }
53func loose(m *Matcher) {
54	ignoreDiacritics(m)
55	ignoreCase(m)
56	ignoreWidth(m)
57}
58
59var (
60	// Supported lists the languages for which search differs from its parent.
61	Supported language.Coverage
62
63	tags []language.Tag
64)
65
66func init() {
67	ids := strings.Split(availableLocales, ",")
68	tags = make([]language.Tag, len(ids))
69	for i, s := range ids {
70		tags[i] = language.Raw.MustParse(s)
71	}
72	Supported = language.NewCoverage(tags)
73}
74
75// New returns a new Matcher for the given language and options.
76func New(t language.Tag, opts ...Option) *Matcher {
77	m := &Matcher{
78		w: getTable(locales[colltab.MatchLang(t, tags)]),
79	}
80	for _, f := range opts {
81		f(m)
82	}
83	return m
84}
85
86// A Matcher implements language-specific string matching.
87type Matcher struct {
88	w                colltab.Weighter
89	ignoreCase       bool
90	ignoreWidth      bool
91	ignoreDiacritics bool
92}
93
94// An IndexOption specifies how the Index methods of Pattern or Matcher should
95// match the input.
96type IndexOption byte
97
98const (
99	// Anchor restricts the search to the start (or end for Backwards) of the
100	// text.
101	Anchor IndexOption = 1 << iota
102
103	// Backwards starts the search from the end of the text.
104	Backwards
105
106	anchorBackwards = Anchor | Backwards
107)
108
109// Index reports the start and end position of the first occurrence of pat in b
110// or -1, -1 if pat is not present.
111func (m *Matcher) Index(b, pat []byte, opts ...IndexOption) (start, end int) {
112	// TODO: implement optimized version that does not use a pattern.
113	return m.Compile(pat).Index(b, opts...)
114}
115
116// IndexString reports the start and end position of the first occurrence of pat
117// in s or -1, -1 if pat is not present.
118func (m *Matcher) IndexString(s, pat string, opts ...IndexOption) (start, end int) {
119	// TODO: implement optimized version that does not use a pattern.
120	return m.CompileString(pat).IndexString(s, opts...)
121}
122
123// Equal reports whether a and b are equivalent.
124func (m *Matcher) Equal(a, b []byte) bool {
125	_, end := m.Index(a, b, Anchor)
126	return end == len(a)
127}
128
129// EqualString reports whether a and b are equivalent.
130func (m *Matcher) EqualString(a, b string) bool {
131	_, end := m.IndexString(a, b, Anchor)
132	return end == len(a)
133}
134
135// Compile compiles and returns a pattern that can be used for faster searching.
136func (m *Matcher) Compile(b []byte) *Pattern {
137	p := &Pattern{m: m}
138	iter := colltab.Iter{Weighter: m.w}
139	for iter.SetInput(b); iter.Next(); {
140	}
141	p.ce = iter.Elems
142	p.deleteEmptyElements()
143	return p
144}
145
146// CompileString compiles and returns a pattern that can be used for faster
147// searching.
148func (m *Matcher) CompileString(s string) *Pattern {
149	p := &Pattern{m: m}
150	iter := colltab.Iter{Weighter: m.w}
151	for iter.SetInputString(s); iter.Next(); {
152	}
153	p.ce = iter.Elems
154	p.deleteEmptyElements()
155	return p
156}
157
158// A Pattern is a compiled search string. It is safe for concurrent use.
159type Pattern struct {
160	m  *Matcher
161	ce []colltab.Elem
162}
163
164// Design note (TODO remove):
165// The cost of retrieving collation elements for each rune, which is used for
166// search as well, is not trivial. Also, algorithms like Boyer-Moore and
167// Sunday require some additional precomputing.
168
169// Index reports the start and end position of the first occurrence of p in b
170// or -1, -1 if p is not present.
171func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) {
172	// Pick a large enough buffer such that we likely do not need to allocate
173	// and small enough to not cause too much overhead initializing.
174	var buf [8]colltab.Elem
175
176	it := &colltab.Iter{
177		Weighter: p.m.w,
178		Elems:    buf[:0],
179	}
180	it.SetInput(b)
181
182	var optMask IndexOption
183	for _, o := range opts {
184		optMask |= o
185	}
186
187	switch optMask {
188	case 0:
189		return p.forwardSearch(it)
190	case Anchor:
191		return p.anchoredForwardSearch(it)
192	case Backwards, anchorBackwards:
193		panic("TODO: implement")
194	default:
195		panic("unrecognized option")
196	}
197}
198
199// IndexString reports the start and end position of the first occurrence of p
200// in s or -1, -1 if p is not present.
201func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) {
202	// Pick a large enough buffer such that we likely do not need to allocate
203	// and small enough to not cause too much overhead initializing.
204	var buf [8]colltab.Elem
205
206	it := &colltab.Iter{
207		Weighter: p.m.w,
208		Elems:    buf[:0],
209	}
210	it.SetInputString(s)
211
212	var optMask IndexOption
213	for _, o := range opts {
214		optMask |= o
215	}
216
217	switch optMask {
218	case 0:
219		return p.forwardSearch(it)
220	case Anchor:
221		return p.anchoredForwardSearch(it)
222	case Backwards, anchorBackwards:
223		panic("TODO: implement")
224	default:
225		panic("unrecognized option")
226	}
227}
228
229// TODO:
230// - Maybe IndexAll methods (probably not necessary).
231// - Some way to match patterns in a Reader (a bit tricky).
232// - Some fold transformer that folds text to comparable text, based on the
233//   search options. This is a common technique, though very different from the
234//   collation-based design of this package. It has a somewhat different use
235//   case, so probably makes sense to support both. Should probably be in a
236//   different package, though, as it uses completely different kind of tables
237//   (based on norm, cases, width and range tables.)
238