text-0.3.2/search/search.go

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:generate go run ../collate/maketables.go -cldr=23 -unicode=6.2.0 -types=search,searchjl -package=search

// Package search provides language-specific search and string matching.
//
// Natural language matching can be intricate. For example, Danish will insist
// "Århus" and "Aarhus" are the same name and Turkish will match I to ı (note
// the lack of a dot) in a case-insensitive match. This package handles such
// language-specific details.
//
// Text passed to any of the calls in this message does not need to be
// normalized.
package search // import "golang.org/x/text/search"

import (
	"strings"

	"golang.org/x/text/internal/colltab"
	"golang.org/x/text/language"
)

// An Option configures a Matcher.
type Option func(*Matcher)

var (
	// WholeWord restricts matches to complete words. The default is to match at
	// the character level.
	WholeWord Option = nil

	// Exact requires that two strings are their exact equivalent. For example
	// å would not match aa in Danish. It overrides any of the ignore options.
	Exact Option = nil

	// Loose causes case, diacritics and width to be ignored.
	Loose Option = loose

	// IgnoreCase enables case-insensitive search.
	IgnoreCase Option = ignoreCase

	// IgnoreDiacritics causes diacritics to be ignored ("ö" == "o").
	IgnoreDiacritics Option = ignoreDiacritics

	// IgnoreWidth equates narrow with wide variants.
	IgnoreWidth Option = ignoreWidth
)

func ignoreDiacritics(m *Matcher) { m.ignoreDiacritics = true }
func ignoreCase(m *Matcher)       { m.ignoreCase = true }
func ignoreWidth(m *Matcher)      { m.ignoreWidth = true }
func loose(m *Matcher) {
	ignoreDiacritics(m)
	ignoreCase(m)
	ignoreWidth(m)
}

var (
	// Supported lists the languages for which search differs from its parent.
	Supported language.Coverage

	tags []language.Tag
)

func init() {
	ids := strings.Split(availableLocales, ",")
	tags = make([]language.Tag, len(ids))
	for i, s := range ids {
		tags[i] = language.Raw.MustParse(s)
	}
	Supported = language.NewCoverage(tags)
}

// New returns a new Matcher for the given language and options.
func New(t language.Tag, opts ...Option) *Matcher {
	m := &Matcher{
		w: getTable(locales[colltab.MatchLang(t, tags)]),
	}
	for _, f := range opts {
		f(m)
	}
	return m
}

// A Matcher implements language-specific string matching.
type Matcher struct {
	w                colltab.Weighter
	ignoreCase       bool
	ignoreWidth      bool
	ignoreDiacritics bool
}

// An IndexOption specifies how the Index methods of Pattern or Matcher should
// match the input.
type IndexOption byte

const (
	// Anchor restricts the search to the start (or end for Backwards) of the
	// text.
	Anchor IndexOption = 1 << iota

	// Backwards starts the search from the end of the text.
	Backwards

	anchorBackwards = Anchor | Backwards
)

// Index reports the start and end position of the first occurrence of pat in b
// or -1, -1 if pat is not present.
func (m *Matcher) Index(b, pat []byte, opts ...IndexOption) (start, end int) {
	// TODO: implement optimized version that does not use a pattern.
	return m.Compile(pat).Index(b, opts...)
}

// IndexString reports the start and end position of the first occurrence of pat
// in s or -1, -1 if pat is not present.
func (m *Matcher) IndexString(s, pat string, opts ...IndexOption) (start, end int) {
	// TODO: implement optimized version that does not use a pattern.
	return m.CompileString(pat).IndexString(s, opts...)
}

// Equal reports whether a and b are equivalent.
func (m *Matcher) Equal(a, b []byte) bool {
	_, end := m.Index(a, b, Anchor)
	return end == len(a)
}

// EqualString reports whether a and b are equivalent.
func (m *Matcher) EqualString(a, b string) bool {
	_, end := m.IndexString(a, b, Anchor)
	return end == len(a)
}

// Compile compiles and returns a pattern that can be used for faster searching.
func (m *Matcher) Compile(b []byte) *Pattern {
	p := &Pattern{m: m}
	iter := colltab.Iter{Weighter: m.w}
	for iter.SetInput(b); iter.Next(); {
	}
	p.ce = iter.Elems
	p.deleteEmptyElements()
	return p
}

// CompileString compiles and returns a pattern that can be used for faster
// searching.
func (m *Matcher) CompileString(s string) *Pattern {
	p := &Pattern{m: m}
	iter := colltab.Iter{Weighter: m.w}
	for iter.SetInputString(s); iter.Next(); {
	}
	p.ce = iter.Elems
	p.deleteEmptyElements()
	return p
}

// A Pattern is a compiled search string. It is safe for concurrent use.
type Pattern struct {
	m  *Matcher
	ce []colltab.Elem
}

// Design note (TODO remove):
// The cost of retrieving collation elements for each rune, which is used for
// search as well, is not trivial. Also, algorithms like Boyer-Moore and
// Sunday require some additional precomputing.

// Index reports the start and end position of the first occurrence of p in b
// or -1, -1 if p is not present.
func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) {
	// Pick a large enough buffer such that we likely do not need to allocate
	// and small enough to not cause too much overhead initializing.
	var buf [8]colltab.Elem

	it := &colltab.Iter{
		Weighter: p.m.w,
		Elems:    buf[:0],
	}
	it.SetInput(b)

	var optMask IndexOption
	for _, o := range opts {
		optMask |= o
	}

	switch optMask {
	case 0:
		return p.forwardSearch(it)
	case Anchor:
		return p.anchoredForwardSearch(it)
	case Backwards, anchorBackwards:
		panic("TODO: implement")
	default:
		panic("unrecognized option")
	}
}

// IndexString reports the start and end position of the first occurrence of p
// in s or -1, -1 if p is not present.
func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) {
	// Pick a large enough buffer such that we likely do not need to allocate
	// and small enough to not cause too much overhead initializing.
	var buf [8]colltab.Elem

	it := &colltab.Iter{
		Weighter: p.m.w,
		Elems:    buf[:0],
	}
	it.SetInputString(s)

	var optMask IndexOption
	for _, o := range opts {
		optMask |= o
	}

	switch optMask {
	case 0:
		return p.forwardSearch(it)
	case Anchor:
		return p.anchoredForwardSearch(it)
	case Backwards, anchorBackwards:
		panic("TODO: implement")
	default:
		panic("unrecognized option")
	}
}

// TODO:
// - Maybe IndexAll methods (probably not necessary).
// - Some way to match patterns in a Reader (a bit tricky).
// - Some fold transformer that folds text to comparable text, based on the
//   search options. This is a common technique, though very different from the
//   collation-based design of this package. It has a somewhat different use
//   case, so probably makes sense to support both. Should probably be in a
//   different package, though, as it uses completely different kind of tables
//   (based on norm, cases, width and range tables.)