1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package colltab
6
7import (
8	"unicode"
9	"unicode/utf8"
10)
11
12// NewNumericWeighter wraps w to replace individual digits to sort based on their
13// numeric value.
14//
15// Weighter w must have a free primary weight after the primary weight for 9.
16// If this is not the case, numeric value will sort at the same primary level
17// as the first primary sorting after 9.
18func NewNumericWeighter(w Weighter) Weighter {
19	getElem := func(s string) Elem {
20		elems, _ := w.AppendNextString(nil, s)
21		return elems[0]
22	}
23	nine := getElem("9")
24
25	// Numbers should order before zero, but the DUCET has no room for this.
26	// TODO: move before zero once we use fractional collation elements.
27	ns, _ := MakeElem(nine.Primary()+1, nine.Secondary(), int(nine.Tertiary()), 0)
28
29	return &numericWeighter{
30		Weighter: w,
31
32		// We assume that w sorts digits of different kinds in order of numeric
33		// value and that the tertiary weight order is preserved.
34		//
35		// TODO: evaluate whether it is worth basing the ranges on the Elem
36		// encoding itself once the move to fractional weights is complete.
37		zero:          getElem("0"),
38		zeroSpecialLo: getElem("0"), // U+FF10 FULLWIDTH DIGIT ZERO
39		zeroSpecialHi: getElem("₀"), // U+2080 SUBSCRIPT ZERO
40		nine:          nine,
41		nineSpecialHi: getElem("₉"), // U+2089 SUBSCRIPT NINE
42		numberStart:   ns,
43	}
44}
45
46// A numericWeighter translates a stream of digits into a stream of weights
47// representing the numeric value.
48type numericWeighter struct {
49	Weighter
50
51	// The Elems below all demarcate boundaries of specific ranges. With the
52	// current element encoding digits are in two ranges: normal (default
53	// tertiary value) and special. For most languages, digits have collation
54	// elements in the normal range.
55	//
56	// Note: the range tests are very specific for the element encoding used by
57	// this implementation. The tests in collate_test.go are designed to fail
58	// if this code is not updated when an encoding has changed.
59
60	zero          Elem // normal digit zero
61	zeroSpecialLo Elem // special digit zero, low tertiary value
62	zeroSpecialHi Elem // special digit zero, high tertiary value
63	nine          Elem // normal digit nine
64	nineSpecialHi Elem // special digit nine
65	numberStart   Elem
66}
67
68// AppendNext calls the namesake of the underlying weigher, but replaces single
69// digits with weights representing their value.
70func (nw *numericWeighter) AppendNext(buf []Elem, s []byte) (ce []Elem, n int) {
71	ce, n = nw.Weighter.AppendNext(buf, s)
72	nc := numberConverter{
73		elems: buf,
74		w:     nw,
75		b:     s,
76	}
77	isZero, ok := nc.checkNextDigit(ce)
78	if !ok {
79		return ce, n
80	}
81	// ce might have been grown already, so take it instead of buf.
82	nc.init(ce, len(buf), isZero)
83	for n < len(s) {
84		ce, sz := nw.Weighter.AppendNext(nc.elems, s[n:])
85		nc.b = s
86		n += sz
87		if !nc.update(ce) {
88			break
89		}
90	}
91	return nc.result(), n
92}
93
94// AppendNextString calls the namesake of the underlying weigher, but replaces
95// single digits with weights representing their value.
96func (nw *numericWeighter) AppendNextString(buf []Elem, s string) (ce []Elem, n int) {
97	ce, n = nw.Weighter.AppendNextString(buf, s)
98	nc := numberConverter{
99		elems: buf,
100		w:     nw,
101		s:     s,
102	}
103	isZero, ok := nc.checkNextDigit(ce)
104	if !ok {
105		return ce, n
106	}
107	nc.init(ce, len(buf), isZero)
108	for n < len(s) {
109		ce, sz := nw.Weighter.AppendNextString(nc.elems, s[n:])
110		nc.s = s
111		n += sz
112		if !nc.update(ce) {
113			break
114		}
115	}
116	return nc.result(), n
117}
118
119type numberConverter struct {
120	w *numericWeighter
121
122	elems    []Elem
123	nDigits  int
124	lenIndex int
125
126	s string // set if the input was of type string
127	b []byte // set if the input was of type []byte
128}
129
130// init completes initialization of a numberConverter and prepares it for adding
131// more digits. elems is assumed to have a digit starting at oldLen.
132func (nc *numberConverter) init(elems []Elem, oldLen int, isZero bool) {
133	// Insert a marker indicating the start of a number and a placeholder
134	// for the number of digits.
135	if isZero {
136		elems = append(elems[:oldLen], nc.w.numberStart, 0)
137	} else {
138		elems = append(elems, 0, 0)
139		copy(elems[oldLen+2:], elems[oldLen:])
140		elems[oldLen] = nc.w.numberStart
141		elems[oldLen+1] = 0
142
143		nc.nDigits = 1
144	}
145	nc.elems = elems
146	nc.lenIndex = oldLen + 1
147}
148
149// checkNextDigit reports whether bufNew adds a single digit relative to the old
150// buffer. If it does, it also reports whether this digit is zero.
151func (nc *numberConverter) checkNextDigit(bufNew []Elem) (isZero, ok bool) {
152	if len(nc.elems) >= len(bufNew) {
153		return false, false
154	}
155	e := bufNew[len(nc.elems)]
156	if e < nc.w.zeroSpecialLo || nc.w.nine < e {
157		// Not a number.
158		return false, false
159	}
160	if e < nc.w.zero {
161		if e > nc.w.nineSpecialHi {
162			// Not a number.
163			return false, false
164		}
165		if !nc.isDigit() {
166			return false, false
167		}
168		isZero = e <= nc.w.zeroSpecialHi
169	} else {
170		// This is the common case if we encounter a digit.
171		isZero = e == nc.w.zero
172	}
173	// Test the remaining added collation elements have a zero primary value.
174	if n := len(bufNew) - len(nc.elems); n > 1 {
175		for i := len(nc.elems) + 1; i < len(bufNew); i++ {
176			if bufNew[i].Primary() != 0 {
177				return false, false
178			}
179		}
180		// In some rare cases, collation elements will encode runes in
181		// unicode.No as a digit. For example Ethiopic digits (U+1369 - U+1371)
182		// are not in Nd. Also some digits that clearly belong in unicode.No,
183		// like U+0C78 TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR, have
184		// collation elements indistinguishable from normal digits.
185		// Unfortunately, this means we need to make this check for nearly all
186		// non-Latin digits.
187		//
188		// TODO: check the performance impact and find something better if it is
189		// an issue.
190		if !nc.isDigit() {
191			return false, false
192		}
193	}
194	return isZero, true
195}
196
197func (nc *numberConverter) isDigit() bool {
198	if nc.b != nil {
199		r, _ := utf8.DecodeRune(nc.b)
200		return unicode.In(r, unicode.Nd)
201	}
202	r, _ := utf8.DecodeRuneInString(nc.s)
203	return unicode.In(r, unicode.Nd)
204}
205
206// We currently support a maximum of about 2M digits (the number of primary
207// values). Such numbers will compare correctly against small numbers, but their
208// comparison against other large numbers is undefined.
209//
210// TODO: define a proper fallback, such as comparing large numbers textually or
211// actually allowing numbers of unlimited length.
212//
213// TODO: cap this to a lower number (like 100) and maybe allow a larger number
214// in an option?
215const maxDigits = 1<<maxPrimaryBits - 1
216
217func (nc *numberConverter) update(elems []Elem) bool {
218	isZero, ok := nc.checkNextDigit(elems)
219	if nc.nDigits == 0 && isZero {
220		return true
221	}
222	nc.elems = elems
223	if !ok {
224		return false
225	}
226	nc.nDigits++
227	return nc.nDigits < maxDigits
228}
229
230// result fills in the length element for the digit sequence and returns the
231// completed collation elements.
232func (nc *numberConverter) result() []Elem {
233	e, _ := MakeElem(nc.nDigits, defaultSecondary, defaultTertiary, 0)
234	nc.elems[nc.lenIndex] = e
235	return nc.elems
236}
237