1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Note: the file data_test.go that is generated should not be checked in.
6//go:generate go run maketables.go triegen.go
7//go:generate go test -tags test
8
9// Package norm contains types and functions for normalizing Unicode strings.
10package norm // import "golang.org/x/text/unicode/norm"
11
12import (
13	"unicode/utf8"
14
15	"golang.org/x/text/transform"
16)
17
18// A Form denotes a canonical representation of Unicode code points.
19// The Unicode-defined normalization and equivalence forms are:
20//
21//   NFC   Unicode Normalization Form C
22//   NFD   Unicode Normalization Form D
23//   NFKC  Unicode Normalization Form KC
24//   NFKD  Unicode Normalization Form KD
25//
26// For a Form f, this documentation uses the notation f(x) to mean
27// the bytes or string x converted to the given form.
28// A position n in x is called a boundary if conversion to the form can
29// proceed independently on both sides:
30//   f(x) == append(f(x[0:n]), f(x[n:])...)
31//
32// References: https://unicode.org/reports/tr15/ and
33// https://unicode.org/notes/tn5/.
34type Form int
35
36const (
37	NFC Form = iota
38	NFD
39	NFKC
40	NFKD
41)
42
43// Bytes returns f(b). May return b if f(b) = b.
44func (f Form) Bytes(b []byte) []byte {
45	src := inputBytes(b)
46	ft := formTable[f]
47	n, ok := ft.quickSpan(src, 0, len(b), true)
48	if ok {
49		return b
50	}
51	out := make([]byte, n, len(b))
52	copy(out, b[0:n])
53	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
54	return doAppendInner(&rb, n)
55}
56
57// String returns f(s).
58func (f Form) String(s string) string {
59	src := inputString(s)
60	ft := formTable[f]
61	n, ok := ft.quickSpan(src, 0, len(s), true)
62	if ok {
63		return s
64	}
65	out := make([]byte, n, len(s))
66	copy(out, s[0:n])
67	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
68	return string(doAppendInner(&rb, n))
69}
70
71// IsNormal returns true if b == f(b).
72func (f Form) IsNormal(b []byte) bool {
73	src := inputBytes(b)
74	ft := formTable[f]
75	bp, ok := ft.quickSpan(src, 0, len(b), true)
76	if ok {
77		return true
78	}
79	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
80	rb.setFlusher(nil, cmpNormalBytes)
81	for bp < len(b) {
82		rb.out = b[bp:]
83		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
84			return false
85		}
86		bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
87	}
88	return true
89}
90
91func cmpNormalBytes(rb *reorderBuffer) bool {
92	b := rb.out
93	for i := 0; i < rb.nrune; i++ {
94		info := rb.rune[i]
95		if int(info.size) > len(b) {
96			return false
97		}
98		p := info.pos
99		pe := p + info.size
100		for ; p < pe; p++ {
101			if b[0] != rb.byte[p] {
102				return false
103			}
104			b = b[1:]
105		}
106	}
107	return true
108}
109
110// IsNormalString returns true if s == f(s).
111func (f Form) IsNormalString(s string) bool {
112	src := inputString(s)
113	ft := formTable[f]
114	bp, ok := ft.quickSpan(src, 0, len(s), true)
115	if ok {
116		return true
117	}
118	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
119	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
120		for i := 0; i < rb.nrune; i++ {
121			info := rb.rune[i]
122			if bp+int(info.size) > len(s) {
123				return false
124			}
125			p := info.pos
126			pe := p + info.size
127			for ; p < pe; p++ {
128				if s[bp] != rb.byte[p] {
129					return false
130				}
131				bp++
132			}
133		}
134		return true
135	})
136	for bp < len(s) {
137		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
138			return false
139		}
140		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
141	}
142	return true
143}
144
145// patchTail fixes a case where a rune may be incorrectly normalized
146// if it is followed by illegal continuation bytes. It returns the
147// patched buffer and whether the decomposition is still in progress.
148func patchTail(rb *reorderBuffer) bool {
149	info, p := lastRuneStart(&rb.f, rb.out)
150	if p == -1 || info.size == 0 {
151		return true
152	}
153	end := p + int(info.size)
154	extra := len(rb.out) - end
155	if extra > 0 {
156		// Potentially allocating memory. However, this only
157		// happens with ill-formed UTF-8.
158		x := make([]byte, 0)
159		x = append(x, rb.out[len(rb.out)-extra:]...)
160		rb.out = rb.out[:end]
161		decomposeToLastBoundary(rb)
162		rb.doFlush()
163		rb.out = append(rb.out, x...)
164		return false
165	}
166	buf := rb.out[p:]
167	rb.out = rb.out[:p]
168	decomposeToLastBoundary(rb)
169	if s := rb.ss.next(info); s == ssStarter {
170		rb.doFlush()
171		rb.ss.first(info)
172	} else if s == ssOverflow {
173		rb.doFlush()
174		rb.insertCGJ()
175		rb.ss = 0
176	}
177	rb.insertUnsafe(inputBytes(buf), 0, info)
178	return true
179}
180
181func appendQuick(rb *reorderBuffer, i int) int {
182	if rb.nsrc == i {
183		return i
184	}
185	end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
186	rb.out = rb.src.appendSlice(rb.out, i, end)
187	return end
188}
189
190// Append returns f(append(out, b...)).
191// The buffer out must be nil, empty, or equal to f(out).
192func (f Form) Append(out []byte, src ...byte) []byte {
193	return f.doAppend(out, inputBytes(src), len(src))
194}
195
196func (f Form) doAppend(out []byte, src input, n int) []byte {
197	if n == 0 {
198		return out
199	}
200	ft := formTable[f]
201	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
202	if len(out) == 0 {
203		p, _ := ft.quickSpan(src, 0, n, true)
204		out = src.appendSlice(out, 0, p)
205		if p == n {
206			return out
207		}
208		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
209		return doAppendInner(&rb, p)
210	}
211	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
212	return doAppend(&rb, out, 0)
213}
214
215func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
216	rb.setFlusher(out, appendFlush)
217	src, n := rb.src, rb.nsrc
218	doMerge := len(out) > 0
219	if q := src.skipContinuationBytes(p); q > p {
220		// Move leading non-starters to destination.
221		rb.out = src.appendSlice(rb.out, p, q)
222		p = q
223		doMerge = patchTail(rb)
224	}
225	fd := &rb.f
226	if doMerge {
227		var info Properties
228		if p < n {
229			info = fd.info(src, p)
230			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
231				if p == 0 {
232					decomposeToLastBoundary(rb)
233				}
234				p = decomposeSegment(rb, p, true)
235			}
236		}
237		if info.size == 0 {
238			rb.doFlush()
239			// Append incomplete UTF-8 encoding.
240			return src.appendSlice(rb.out, p, n)
241		}
242		if rb.nrune > 0 {
243			return doAppendInner(rb, p)
244		}
245	}
246	p = appendQuick(rb, p)
247	return doAppendInner(rb, p)
248}
249
250func doAppendInner(rb *reorderBuffer, p int) []byte {
251	for n := rb.nsrc; p < n; {
252		p = decomposeSegment(rb, p, true)
253		p = appendQuick(rb, p)
254	}
255	return rb.out
256}
257
258// AppendString returns f(append(out, []byte(s))).
259// The buffer out must be nil, empty, or equal to f(out).
260func (f Form) AppendString(out []byte, src string) []byte {
261	return f.doAppend(out, inputString(src), len(src))
262}
263
264// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
265// It is not guaranteed to return the largest such n.
266func (f Form) QuickSpan(b []byte) int {
267	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
268	return n
269}
270
271// Span implements transform.SpanningTransformer. It returns a boundary n such
272// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
273func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
274	n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
275	if n < len(b) {
276		if !ok {
277			err = transform.ErrEndOfSpan
278		} else {
279			err = transform.ErrShortSrc
280		}
281	}
282	return n, err
283}
284
285// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
286// It is not guaranteed to return the largest such n.
287func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
288	n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
289	if n < len(s) {
290		if !ok {
291			err = transform.ErrEndOfSpan
292		} else {
293			err = transform.ErrShortSrc
294		}
295	}
296	return n, err
297}
298
299// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
300// whether any non-normalized parts were found. If atEOF is false, n will
301// not point past the last segment if this segment might be become
302// non-normalized by appending other runes.
303func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
304	var lastCC uint8
305	ss := streamSafe(0)
306	lastSegStart := i
307	for n = end; i < n; {
308		if j := src.skipASCII(i, n); i != j {
309			i = j
310			lastSegStart = i - 1
311			lastCC = 0
312			ss = 0
313			continue
314		}
315		info := f.info(src, i)
316		if info.size == 0 {
317			if atEOF {
318				// include incomplete runes
319				return n, true
320			}
321			return lastSegStart, true
322		}
323		// This block needs to be before the next, because it is possible to
324		// have an overflow for runes that are starters (e.g. with U+FF9E).
325		switch ss.next(info) {
326		case ssStarter:
327			lastSegStart = i
328		case ssOverflow:
329			return lastSegStart, false
330		case ssSuccess:
331			if lastCC > info.ccc {
332				return lastSegStart, false
333			}
334		}
335		if f.composing {
336			if !info.isYesC() {
337				break
338			}
339		} else {
340			if !info.isYesD() {
341				break
342			}
343		}
344		lastCC = info.ccc
345		i += int(info.size)
346	}
347	if i == n {
348		if !atEOF {
349			n = lastSegStart
350		}
351		return n, true
352	}
353	return lastSegStart, false
354}
355
356// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
357// It is not guaranteed to return the largest such n.
358func (f Form) QuickSpanString(s string) int {
359	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
360	return n
361}
362
363// FirstBoundary returns the position i of the first boundary in b
364// or -1 if b contains no boundary.
365func (f Form) FirstBoundary(b []byte) int {
366	return f.firstBoundary(inputBytes(b), len(b))
367}
368
369func (f Form) firstBoundary(src input, nsrc int) int {
370	i := src.skipContinuationBytes(0)
371	if i >= nsrc {
372		return -1
373	}
374	fd := formTable[f]
375	ss := streamSafe(0)
376	// We should call ss.first here, but we can't as the first rune is
377	// skipped already. This means FirstBoundary can't really determine
378	// CGJ insertion points correctly. Luckily it doesn't have to.
379	for {
380		info := fd.info(src, i)
381		if info.size == 0 {
382			return -1
383		}
384		if s := ss.next(info); s != ssSuccess {
385			return i
386		}
387		i += int(info.size)
388		if i >= nsrc {
389			if !info.BoundaryAfter() && !ss.isMax() {
390				return -1
391			}
392			return nsrc
393		}
394	}
395}
396
397// FirstBoundaryInString returns the position i of the first boundary in s
398// or -1 if s contains no boundary.
399func (f Form) FirstBoundaryInString(s string) int {
400	return f.firstBoundary(inputString(s), len(s))
401}
402
403// NextBoundary reports the index of the boundary between the first and next
404// segment in b or -1 if atEOF is false and there are not enough bytes to
405// determine this boundary.
406func (f Form) NextBoundary(b []byte, atEOF bool) int {
407	return f.nextBoundary(inputBytes(b), len(b), atEOF)
408}
409
410// NextBoundaryInString reports the index of the boundary between the first and
411// next segment in b or -1 if atEOF is false and there are not enough bytes to
412// determine this boundary.
413func (f Form) NextBoundaryInString(s string, atEOF bool) int {
414	return f.nextBoundary(inputString(s), len(s), atEOF)
415}
416
417func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
418	if nsrc == 0 {
419		if atEOF {
420			return 0
421		}
422		return -1
423	}
424	fd := formTable[f]
425	info := fd.info(src, 0)
426	if info.size == 0 {
427		if atEOF {
428			return 1
429		}
430		return -1
431	}
432	ss := streamSafe(0)
433	ss.first(info)
434
435	for i := int(info.size); i < nsrc; i += int(info.size) {
436		info = fd.info(src, i)
437		if info.size == 0 {
438			if atEOF {
439				return i
440			}
441			return -1
442		}
443		// TODO: Using streamSafe to determine the boundary isn't the same as
444		// using BoundaryBefore. Determine which should be used.
445		if s := ss.next(info); s != ssSuccess {
446			return i
447		}
448	}
449	if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
450		return -1
451	}
452	return nsrc
453}
454
455// LastBoundary returns the position i of the last boundary in b
456// or -1 if b contains no boundary.
457func (f Form) LastBoundary(b []byte) int {
458	return lastBoundary(formTable[f], b)
459}
460
461func lastBoundary(fd *formInfo, b []byte) int {
462	i := len(b)
463	info, p := lastRuneStart(fd, b)
464	if p == -1 {
465		return -1
466	}
467	if info.size == 0 { // ends with incomplete rune
468		if p == 0 { // starts with incomplete rune
469			return -1
470		}
471		i = p
472		info, p = lastRuneStart(fd, b[:i])
473		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
474			return i
475		}
476	}
477	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
478		return i
479	}
480	if info.BoundaryAfter() {
481		return i
482	}
483	ss := streamSafe(0)
484	v := ss.backwards(info)
485	for i = p; i >= 0 && v != ssStarter; i = p {
486		info, p = lastRuneStart(fd, b[:i])
487		if v = ss.backwards(info); v == ssOverflow {
488			break
489		}
490		if p+int(info.size) != i {
491			if p == -1 { // no boundary found
492				return -1
493			}
494			return i // boundary after an illegal UTF-8 encoding
495		}
496	}
497	return i
498}
499
500// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
501// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
502// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
503func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
504	// Force one character to be consumed.
505	info := rb.f.info(rb.src, sp)
506	if info.size == 0 {
507		return 0
508	}
509	if s := rb.ss.next(info); s == ssStarter {
510		// TODO: this could be removed if we don't support merging.
511		if rb.nrune > 0 {
512			goto end
513		}
514	} else if s == ssOverflow {
515		rb.insertCGJ()
516		goto end
517	}
518	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
519		return int(err)
520	}
521	for {
522		sp += int(info.size)
523		if sp >= rb.nsrc {
524			if !atEOF && !info.BoundaryAfter() {
525				return int(iShortSrc)
526			}
527			break
528		}
529		info = rb.f.info(rb.src, sp)
530		if info.size == 0 {
531			if !atEOF {
532				return int(iShortSrc)
533			}
534			break
535		}
536		if s := rb.ss.next(info); s == ssStarter {
537			break
538		} else if s == ssOverflow {
539			rb.insertCGJ()
540			break
541		}
542		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
543			return int(err)
544		}
545	}
546end:
547	if !rb.doFlush() {
548		return int(iShortDst)
549	}
550	return sp
551}
552
553// lastRuneStart returns the runeInfo and position of the last
554// rune in buf or the zero runeInfo and -1 if no rune was found.
555func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
556	p := len(buf) - 1
557	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
558	}
559	if p < 0 {
560		return Properties{}, -1
561	}
562	return fd.info(inputBytes(buf), p), p
563}
564
565// decomposeToLastBoundary finds an open segment at the end of the buffer
566// and scans it into rb. Returns the buffer minus the last segment.
567func decomposeToLastBoundary(rb *reorderBuffer) {
568	fd := &rb.f
569	info, i := lastRuneStart(fd, rb.out)
570	if int(info.size) != len(rb.out)-i {
571		// illegal trailing continuation bytes
572		return
573	}
574	if info.BoundaryAfter() {
575		return
576	}
577	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
578	padd := 0
579	ss := streamSafe(0)
580	p := len(rb.out)
581	for {
582		add[padd] = info
583		v := ss.backwards(info)
584		if v == ssOverflow {
585			// Note that if we have an overflow, it the string we are appending to
586			// is not correctly normalized. In this case the behavior is undefined.
587			break
588		}
589		padd++
590		p -= int(info.size)
591		if v == ssStarter || p < 0 {
592			break
593		}
594		info, i = lastRuneStart(fd, rb.out[:p])
595		if int(info.size) != p-i {
596			break
597		}
598	}
599	rb.ss = ss
600	// Copy bytes for insertion as we may need to overwrite rb.out.
601	var buf [maxBufferSize * utf8.UTFMax]byte
602	cp := buf[:copy(buf[:], rb.out[p:])]
603	rb.out = rb.out[:p]
604	for padd--; padd >= 0; padd-- {
605		info = add[padd]
606		rb.insertUnsafe(inputBytes(cp), 0, info)
607		cp = cp[info.size:]
608	}
609}
610