1// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
2
3// Copyright 2011 The Go Authors. All rights reserved.
4// Use of this source code is governed by a BSD-style
5// license that can be found in the LICENSE file.
6
7package norm
8
9import "unicode/utf8"
10
11const (
12	maxNonStarters = 30
13	// The maximum number of characters needed for a buffer is
14	// maxNonStarters + 1 for the starter + 1 for the GCJ
15	maxBufferSize    = maxNonStarters + 2
16	maxNFCExpansion  = 3  // NFC(0x1D160)
17	maxNFKCExpansion = 18 // NFKC(0xFDFA)
18
19	maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
20)
21
22// ssState is used for reporting the segment state after inserting a rune.
23// It is returned by streamSafe.next.
24type ssState int
25
26const (
27	// Indicates a rune was successfully added to the segment.
28	ssSuccess ssState = iota
29	// Indicates a rune starts a new segment and should not be added.
30	ssStarter
31	// Indicates a rune caused a segment overflow and a CGJ should be inserted.
32	ssOverflow
33)
34
35// streamSafe implements the policy of when a CGJ should be inserted.
36type streamSafe uint8
37
38// first inserts the first rune of a segment. It is a faster version of next if
39// it is known p represents the first rune in a segment.
40func (ss *streamSafe) first(p Properties) {
41	*ss = streamSafe(p.nTrailingNonStarters())
42}
43
44// insert returns a ssState value to indicate whether a rune represented by p
45// can be inserted.
46func (ss *streamSafe) next(p Properties) ssState {
47	if *ss > maxNonStarters {
48		panic("streamSafe was not reset")
49	}
50	n := p.nLeadingNonStarters()
51	if *ss += streamSafe(n); *ss > maxNonStarters {
52		*ss = 0
53		return ssOverflow
54	}
55	// The Stream-Safe Text Processing prescribes that the counting can stop
56	// as soon as a starter is encountered. However, there are some starters,
57	// like Jamo V and T, that can combine with other runes, leaving their
58	// successive non-starters appended to the previous, possibly causing an
59	// overflow. We will therefore consider any rune with a non-zero nLead to
60	// be a non-starter. Note that it always hold that if nLead > 0 then
61	// nLead == nTrail.
62	if n == 0 {
63		*ss = streamSafe(p.nTrailingNonStarters())
64		return ssStarter
65	}
66	return ssSuccess
67}
68
69// backwards is used for checking for overflow and segment starts
70// when traversing a string backwards. Users do not need to call first
71// for the first rune. The state of the streamSafe retains the count of
72// the non-starters loaded.
73func (ss *streamSafe) backwards(p Properties) ssState {
74	if *ss > maxNonStarters {
75		panic("streamSafe was not reset")
76	}
77	c := *ss + streamSafe(p.nTrailingNonStarters())
78	if c > maxNonStarters {
79		return ssOverflow
80	}
81	*ss = c
82	if p.nLeadingNonStarters() == 0 {
83		return ssStarter
84	}
85	return ssSuccess
86}
87
88func (ss streamSafe) isMax() bool {
89	return ss == maxNonStarters
90}
91
92// GraphemeJoiner is inserted after maxNonStarters non-starter runes.
93const GraphemeJoiner = "\u034F"
94
95// reorderBuffer is used to normalize a single segment.  Characters inserted with
96// insert are decomposed and reordered based on CCC. The compose method can
97// be used to recombine characters.  Note that the byte buffer does not hold
98// the UTF-8 characters in order.  Only the rune array is maintained in sorted
99// order. flush writes the resulting segment to a byte array.
100type reorderBuffer struct {
101	rune  [maxBufferSize]Properties // Per character info.
102	byte  [maxByteBufferSize]byte   // UTF-8 buffer. Referenced by runeInfo.pos.
103	nbyte uint8                     // Number or bytes.
104	ss    streamSafe                // For limiting length of non-starter sequence.
105	nrune int                       // Number of runeInfos.
106	f     formInfo
107
108	src      input
109	nsrc     int
110	tmpBytes input
111
112	out    []byte
113	flushF func(*reorderBuffer) bool
114}
115
116func (rb *reorderBuffer) init(f Form, src []byte) {
117	rb.f = *formTable[f]
118	rb.src.setBytes(src)
119	rb.nsrc = len(src)
120	rb.ss = 0
121}
122
123func (rb *reorderBuffer) initString(f Form, src string) {
124	rb.f = *formTable[f]
125	rb.src.setString(src)
126	rb.nsrc = len(src)
127	rb.ss = 0
128}
129
130func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) {
131	rb.out = out
132	rb.flushF = f
133}
134
135// reset discards all characters from the buffer.
136func (rb *reorderBuffer) reset() {
137	rb.nrune = 0
138	rb.nbyte = 0
139}
140
141func (rb *reorderBuffer) doFlush() bool {
142	if rb.f.composing {
143		rb.compose()
144	}
145	res := rb.flushF(rb)
146	rb.reset()
147	return res
148}
149
150// appendFlush appends the normalized segment to rb.out.
151func appendFlush(rb *reorderBuffer) bool {
152	for i := 0; i < rb.nrune; i++ {
153		start := rb.rune[i].pos
154		end := start + rb.rune[i].size
155		rb.out = append(rb.out, rb.byte[start:end]...)
156	}
157	return true
158}
159
160// flush appends the normalized segment to out and resets rb.
161func (rb *reorderBuffer) flush(out []byte) []byte {
162	for i := 0; i < rb.nrune; i++ {
163		start := rb.rune[i].pos
164		end := start + rb.rune[i].size
165		out = append(out, rb.byte[start:end]...)
166	}
167	rb.reset()
168	return out
169}
170
171// flushCopy copies the normalized segment to buf and resets rb.
172// It returns the number of bytes written to buf.
173func (rb *reorderBuffer) flushCopy(buf []byte) int {
174	p := 0
175	for i := 0; i < rb.nrune; i++ {
176		runep := rb.rune[i]
177		p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
178	}
179	rb.reset()
180	return p
181}
182
183// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
184// It returns false if the buffer is not large enough to hold the rune.
185// It is used internally by insert and insertString only.
186func (rb *reorderBuffer) insertOrdered(info Properties) {
187	n := rb.nrune
188	b := rb.rune[:]
189	cc := info.ccc
190	if cc > 0 {
191		// Find insertion position + move elements to make room.
192		for ; n > 0; n-- {
193			if b[n-1].ccc <= cc {
194				break
195			}
196			b[n] = b[n-1]
197		}
198	}
199	rb.nrune += 1
200	pos := uint8(rb.nbyte)
201	rb.nbyte += utf8.UTFMax
202	info.pos = pos
203	b[n] = info
204}
205
206// insertErr is an error code returned by insert. Using this type instead
207// of error improves performance up to 20% for many of the benchmarks.
208type insertErr int
209
210const (
211	iSuccess insertErr = -iota
212	iShortDst
213	iShortSrc
214)
215
216// insertFlush inserts the given rune in the buffer ordered by CCC.
217// If a decomposition with multiple segments are encountered, they leading
218// ones are flushed.
219// It returns a non-zero error code if the rune was not inserted.
220func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr {
221	if rune := src.hangul(i); rune != 0 {
222		rb.decomposeHangul(rune)
223		return iSuccess
224	}
225	if info.hasDecomposition() {
226		return rb.insertDecomposed(info.Decomposition())
227	}
228	rb.insertSingle(src, i, info)
229	return iSuccess
230}
231
232// insertUnsafe inserts the given rune in the buffer ordered by CCC.
233// It is assumed there is sufficient space to hold the runes. It is the
234// responsibility of the caller to ensure this. This can be done by checking
235// the state returned by the streamSafe type.
236func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) {
237	if rune := src.hangul(i); rune != 0 {
238		rb.decomposeHangul(rune)
239	}
240	if info.hasDecomposition() {
241		// TODO: inline.
242		rb.insertDecomposed(info.Decomposition())
243	} else {
244		rb.insertSingle(src, i, info)
245	}
246}
247
248// insertDecomposed inserts an entry in to the reorderBuffer for each rune
249// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
250// It flushes the buffer on each new segment start.
251func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
252	rb.tmpBytes.setBytes(dcomp)
253	// As the streamSafe accounting already handles the counting for modifiers,
254	// we don't have to call next. However, we do need to keep the accounting
255	// intact when flushing the buffer.
256	for i := 0; i < len(dcomp); {
257		info := rb.f.info(rb.tmpBytes, i)
258		if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {
259			return iShortDst
260		}
261		i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)])
262		rb.insertOrdered(info)
263	}
264	return iSuccess
265}
266
267// insertSingle inserts an entry in the reorderBuffer for the rune at
268// position i. info is the runeInfo for the rune at position i.
269func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) {
270	src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size))
271	rb.insertOrdered(info)
272}
273
274// insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.
275func (rb *reorderBuffer) insertCGJ() {
276	rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))})
277}
278
279// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
280func (rb *reorderBuffer) appendRune(r rune) {
281	bn := rb.nbyte
282	sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
283	rb.nbyte += utf8.UTFMax
284	rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
285	rb.nrune++
286}
287
288// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
289func (rb *reorderBuffer) assignRune(pos int, r rune) {
290	bn := rb.rune[pos].pos
291	sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
292	rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
293}
294
295// runeAt returns the rune at position n. It is used for Hangul and recomposition.
296func (rb *reorderBuffer) runeAt(n int) rune {
297	inf := rb.rune[n]
298	r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
299	return r
300}
301
302// bytesAt returns the UTF-8 encoding of the rune at position n.
303// It is used for Hangul and recomposition.
304func (rb *reorderBuffer) bytesAt(n int) []byte {
305	inf := rb.rune[n]
306	return rb.byte[inf.pos : int(inf.pos)+int(inf.size)]
307}
308
309// For Hangul we combine algorithmically, instead of using tables.
310const (
311	hangulBase  = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
312	hangulBase0 = 0xEA
313	hangulBase1 = 0xB0
314	hangulBase2 = 0x80
315
316	hangulEnd  = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
317	hangulEnd0 = 0xED
318	hangulEnd1 = 0x9E
319	hangulEnd2 = 0xA4
320
321	jamoLBase  = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
322	jamoLBase0 = 0xE1
323	jamoLBase1 = 0x84
324	jamoLEnd   = 0x1113
325	jamoVBase  = 0x1161
326	jamoVEnd   = 0x1176
327	jamoTBase  = 0x11A7
328	jamoTEnd   = 0x11C3
329
330	jamoTCount   = 28
331	jamoVCount   = 21
332	jamoVTCount  = 21 * 28
333	jamoLVTCount = 19 * 21 * 28
334)
335
336const hangulUTF8Size = 3
337
338func isHangul(b []byte) bool {
339	if len(b) < hangulUTF8Size {
340		return false
341	}
342	b0 := b[0]
343	if b0 < hangulBase0 {
344		return false
345	}
346	b1 := b[1]
347	switch {
348	case b0 == hangulBase0:
349		return b1 >= hangulBase1
350	case b0 < hangulEnd0:
351		return true
352	case b0 > hangulEnd0:
353		return false
354	case b1 < hangulEnd1:
355		return true
356	}
357	return b1 == hangulEnd1 && b[2] < hangulEnd2
358}
359
360func isHangulString(b string) bool {
361	if len(b) < hangulUTF8Size {
362		return false
363	}
364	b0 := b[0]
365	if b0 < hangulBase0 {
366		return false
367	}
368	b1 := b[1]
369	switch {
370	case b0 == hangulBase0:
371		return b1 >= hangulBase1
372	case b0 < hangulEnd0:
373		return true
374	case b0 > hangulEnd0:
375		return false
376	case b1 < hangulEnd1:
377		return true
378	}
379	return b1 == hangulEnd1 && b[2] < hangulEnd2
380}
381
382// Caller must ensure len(b) >= 2.
383func isJamoVT(b []byte) bool {
384	// True if (rune & 0xff00) == jamoLBase
385	return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
386}
387
388func isHangulWithoutJamoT(b []byte) bool {
389	c, _ := utf8.DecodeRune(b)
390	c -= hangulBase
391	return c < jamoLVTCount && c%jamoTCount == 0
392}
393
394// decomposeHangul writes the decomposed Hangul to buf and returns the number
395// of bytes written.  len(buf) should be at least 9.
396func decomposeHangul(buf []byte, r rune) int {
397	const JamoUTF8Len = 3
398	r -= hangulBase
399	x := r % jamoTCount
400	r /= jamoTCount
401	utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
402	utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
403	if x != 0 {
404		utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
405		return 3 * JamoUTF8Len
406	}
407	return 2 * JamoUTF8Len
408}
409
410// decomposeHangul algorithmically decomposes a Hangul rune into
411// its Jamo components.
412// See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
413func (rb *reorderBuffer) decomposeHangul(r rune) {
414	r -= hangulBase
415	x := r % jamoTCount
416	r /= jamoTCount
417	rb.appendRune(jamoLBase + r/jamoVCount)
418	rb.appendRune(jamoVBase + r%jamoVCount)
419	if x != 0 {
420		rb.appendRune(jamoTBase + x)
421	}
422}
423
424// combineHangul algorithmically combines Jamo character components into Hangul.
425// See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
426func (rb *reorderBuffer) combineHangul(s, i, k int) {
427	b := rb.rune[:]
428	bn := rb.nrune
429	for ; i < bn; i++ {
430		cccB := b[k-1].ccc
431		cccC := b[i].ccc
432		if cccB == 0 {
433			s = k - 1
434		}
435		if s != k-1 && cccB >= cccC {
436			// b[i] is blocked by greater-equal cccX below it
437			b[k] = b[i]
438			k++
439		} else {
440			l := rb.runeAt(s) // also used to compare to hangulBase
441			v := rb.runeAt(i) // also used to compare to jamoT
442			switch {
443			case jamoLBase <= l && l < jamoLEnd &&
444				jamoVBase <= v && v < jamoVEnd:
445				// 11xx plus 116x to LV
446				rb.assignRune(s, hangulBase+
447					(l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount)
448			case hangulBase <= l && l < hangulEnd &&
449				jamoTBase < v && v < jamoTEnd &&
450				((l-hangulBase)%jamoTCount) == 0:
451				// ACxx plus 11Ax to LVT
452				rb.assignRune(s, l+v-jamoTBase)
453			default:
454				b[k] = b[i]
455				k++
456			}
457		}
458	}
459	rb.nrune = k
460}
461
462// compose recombines the runes in the buffer.
463// It should only be used to recompose a single segment, as it will not
464// handle alternations between Hangul and non-Hangul characters correctly.
465func (rb *reorderBuffer) compose() {
466	// UAX #15, section X5 , including Corrigendum #5
467	// "In any character sequence beginning with starter S, a character C is
468	//  blocked from S if and only if there is some character B between S
469	//  and C, and either B is a starter or it has the same or higher
470	//  combining class as C."
471	bn := rb.nrune
472	if bn == 0 {
473		return
474	}
475	k := 1
476	b := rb.rune[:]
477	for s, i := 0, 1; i < bn; i++ {
478		if isJamoVT(rb.bytesAt(i)) {
479			// Redo from start in Hangul mode. Necessary to support
480			// U+320E..U+321E in NFKC mode.
481			rb.combineHangul(s, i, k)
482			return
483		}
484		ii := b[i]
485		// We can only use combineForward as a filter if we later
486		// get the info for the combined character. This is more
487		// expensive than using the filter. Using combinesBackward()
488		// is safe.
489		if ii.combinesBackward() {
490			cccB := b[k-1].ccc
491			cccC := ii.ccc
492			blocked := false // b[i] blocked by starter or greater or equal CCC?
493			if cccB == 0 {
494				s = k - 1
495			} else {
496				blocked = s != k-1 && cccB >= cccC
497			}
498			if !blocked {
499				combined := combine(rb.runeAt(s), rb.runeAt(i))
500				if combined != 0 {
501					rb.assignRune(s, combined)
502					continue
503				}
504			}
505		}
506		b[k] = b[i]
507		k++
508	}
509	rb.nrune = k
510}
511