1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package collate
6
7import (
8	"sort"
9
10	"golang.org/x/text/internal/colltab"
11	"golang.org/x/text/language"
12	"golang.org/x/text/unicode/norm"
13)
14
15// newCollator creates a new collator with default options configured.
16func newCollator(t colltab.Weighter) *Collator {
17	// Initialize a collator with default options.
18	c := &Collator{
19		options: options{
20			ignore: [colltab.NumLevels]bool{
21				colltab.Quaternary: true,
22				colltab.Identity:   true,
23			},
24			f: norm.NFD,
25			t: t,
26		},
27	}
28
29	// TODO: store vt in tags or remove.
30	c.variableTop = t.Top()
31
32	return c
33}
34
// An Option is used to change the behavior of a Collator. Options override the
// settings passed through the locale identifier.
type Option struct {
	// priority determines when the option is applied relative to others:
	// setOptions sorts options by ascending priority before running them, so
	// an option with a higher priority runs later and can overwrite the
	// effects of one with a lower priority.
	priority int

	// f applies the option by mutating the given options value.
	f func(o *options)
}
41
42type prioritizedOptions []Option
43
44func (p prioritizedOptions) Len() int {
45	return len(p)
46}
47
48func (p prioritizedOptions) Swap(i, j int) {
49	p[i], p[j] = p[j], p[i]
50}
51
52func (p prioritizedOptions) Less(i, j int) bool {
53	return p[i].priority < p[j].priority
54}
55
// options holds the settings of a Collator. newCollator fills in the
// defaults; Option values and setFromTag modify them afterwards.
type options struct {
	// ignore specifies which levels to ignore.
	ignore [colltab.NumLevels]bool

	// caseLevel is true if there is an additional level of case matching
	// between the secondary and tertiary levels.
	caseLevel bool

	// backwards specifies the order of sorting at the secondary level.
	// This option exists predominantly to support reverse sorting of accents in French.
	backwards bool

	// numeric specifies whether any sequence of decimal digits (category is Nd)
	// is sorted at a primary level with its numeric value.
	// For example, "A-21" < "A-123".
	// This option is set by wrapping the main Weighter with NewNumericWeighter.
	numeric bool

	// alternate specifies an alternative handling of variables.
	alternate alternateHandling

	// variableTop is the largest primary value that is considered to be
	// variable.
	variableTop uint32

	// t is the Weighter; newCollator stores its argument here.
	t colltab.Weighter

	// f is a normalization form; newCollator initializes it to norm.NFD.
	f norm.Form
}
85
86func (o *options) setOptions(opts []Option) {
87	sort.Sort(prioritizedOptions(opts))
88	for _, x := range opts {
89		x.f(o)
90	}
91}
92
93// OptionsFromTag extracts the BCP47 collation options from the tag and
94// configures a collator accordingly. These options are set before any other
95// option.
96func OptionsFromTag(t language.Tag) Option {
97	return Option{0, func(o *options) {
98		o.setFromTag(t)
99	}}
100}
101
102func (o *options) setFromTag(t language.Tag) {
103	o.caseLevel = ldmlBool(t, o.caseLevel, "kc")
104	o.backwards = ldmlBool(t, o.backwards, "kb")
105	o.numeric = ldmlBool(t, o.numeric, "kn")
106
107	// Extract settings from the BCP47 u extension.
108	switch t.TypeForKey("ks") { // strength
109	case "level1":
110		o.ignore[colltab.Secondary] = true
111		o.ignore[colltab.Tertiary] = true
112	case "level2":
113		o.ignore[colltab.Tertiary] = true
114	case "level3", "":
115		// The default.
116	case "level4":
117		o.ignore[colltab.Quaternary] = false
118	case "identic":
119		o.ignore[colltab.Quaternary] = false
120		o.ignore[colltab.Identity] = false
121	}
122
123	switch t.TypeForKey("ka") {
124	case "shifted":
125		o.alternate = altShifted
126	// The following two types are not official BCP47, but we support them to
127	// give access to this otherwise hidden functionality. The name blanked is
128	// derived from the LDML name blanked and posix reflects the main use of
129	// the shift-trimmed option.
130	case "blanked":
131		o.alternate = altBlanked
132	case "posix":
133		o.alternate = altShiftTrimmed
134	}
135
136	// TODO: caseFirst ("kf"), reorder ("kr"), and maybe variableTop ("vt").
137
138	// Not used:
139	// - normalization ("kk", not necessary for this implementation)
140	// - hiraganaQuatenary ("kh", obsolete)
141}
142
143func ldmlBool(t language.Tag, old bool, key string) bool {
144	switch t.TypeForKey(key) {
145	case "true":
146		return true
147	case "false":
148		return false
149	default:
150		return old
151	}
152}
153
154var (
155	// IgnoreCase sets case-insensitive comparison.
156	IgnoreCase Option = ignoreCase
157	ignoreCase        = Option{3, ignoreCaseF}
158
159	// IgnoreDiacritics causes diacritical marks to be ignored. ("o" == "ö").
160	IgnoreDiacritics Option = ignoreDiacritics
161	ignoreDiacritics        = Option{3, ignoreDiacriticsF}
162
163	// IgnoreWidth causes full-width characters to match their half-width
164	// equivalents.
165	IgnoreWidth Option = ignoreWidth
166	ignoreWidth        = Option{2, ignoreWidthF}
167
168	// Loose sets the collator to ignore diacritics, case and width.
169	Loose Option = loose
170	loose        = Option{4, looseF}
171
172	// Force ordering if strings are equivalent but not equal.
173	Force Option = force
174	force        = Option{5, forceF}
175
176	// Numeric specifies that numbers should sort numerically ("2" < "12").
177	Numeric Option = numeric
178	numeric        = Option{5, numericF}
179)
180
181func ignoreWidthF(o *options) {
182	o.ignore[colltab.Tertiary] = true
183	o.caseLevel = true
184}
185
186func ignoreDiacriticsF(o *options) {
187	o.ignore[colltab.Secondary] = true
188}
189
190func ignoreCaseF(o *options) {
191	o.ignore[colltab.Tertiary] = true
192	o.caseLevel = false
193}
194
195func looseF(o *options) {
196	ignoreWidthF(o)
197	ignoreDiacriticsF(o)
198	ignoreCaseF(o)
199}
200
201func forceF(o *options) {
202	o.ignore[colltab.Identity] = false
203}
204
205func numericF(o *options) { o.numeric = true }
206
207// Reorder overrides the pre-defined ordering of scripts and character sets.
208func Reorder(s ...string) Option {
209	// TODO: need fractional weights to implement this.
210	panic("TODO: implement")
211}
212
213// TODO: consider making these public again. These options cannot be fully
214// specified in BCP47, so an API interface seems warranted. Still a higher-level
215// interface would be nice (e.g. a POSIX option for enabling altShiftTrimmed)
216
// alternateHandling identifies the various ways in which variables are handled.
// A rune with a primary weight lower than the variable top is considered a
// variable.
// See https://www.unicode.org/reports/tr10/#Variable_Weighting for details.
//
// The mode is stored in options.alternate; setFromTag selects it from the
// "ka" key of a language tag.
type alternateHandling int

const (
	// altNonIgnorable turns off special handling of variables.
	// It is the zero value of alternateHandling.
	altNonIgnorable alternateHandling = iota

	// altBlanked sets variables and all subsequent primary ignorables to be
	// ignorable at all levels. This is identical to removing all variables
	// and subsequent primary ignorables from the input.
	// setFromTag selects it for the "ka" type "blanked".
	altBlanked

	// altShifted sets variables to be ignorable for levels one through three and
	// adds a fourth level based on the values of the ignored levels.
	// setFromTag selects it for the "ka" type "shifted".
	altShifted

	// altShiftTrimmed is a slight variant of altShifted that is used to
	// emulate POSIX.
	// setFromTag selects it for the "ka" type "posix".
	altShiftTrimmed
)
240