1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build icu
6// +build icu
7
8package main
9
10/*
11#cgo LDFLAGS: -licui18n -licuuc
12#include <stdlib.h>
13#include <unicode/ucol.h>
14#include <unicode/uiter.h>
15#include <unicode/utypes.h>
16*/
17import "C"
18import (
19	"fmt"
20	"log"
21	"unicode/utf16"
22	"unicode/utf8"
23	"unsafe"
24)
25
26func init() {
27	AddFactory(CollatorFactory{"icu", newUTF16,
28		"Main ICU collator, using native strings."})
29	AddFactory(CollatorFactory{"icu8", newUTF8iter,
30		"ICU collator using ICU iterators to process UTF8."})
31	AddFactory(CollatorFactory{"icu16", newUTF8conv,
32		"ICU collation by first converting UTF8 to UTF16."})
33}
34
35func icuCharP(s []byte) *C.char {
36	return (*C.char)(unsafe.Pointer(&s[0]))
37}
38
39func icuUInt8P(s []byte) *C.uint8_t {
40	return (*C.uint8_t)(unsafe.Pointer(&s[0]))
41}
42
43func icuUCharP(s []uint16) *C.UChar {
44	return (*C.UChar)(unsafe.Pointer(&s[0]))
45}
46func icuULen(s []uint16) C.int32_t {
47	return C.int32_t(len(s))
48}
49func icuSLen(s []byte) C.int32_t {
50	return C.int32_t(len(s))
51}
52
// icuCollator implements a Collator based on ICU.
// It holds the state shared by the ICU-backed collator types in this file,
// which embed it.
type icuCollator struct {
	// loc is the C copy of the locale name, created with C.CString in init
	// and freed in Close.
	loc *C.char
	// col is the collator handle returned by ucol_open in init and released
	// with ucol_close in Close.
	col *C.UCollator
	// keyBuf is the buffer sort keys are written into. Its length marks the
	// part already handed out by extendBuf; the space between length and
	// capacity is what buf offers to ICU for the next key.
	keyBuf []byte
}
59
// growBufSize is the capacity, in bytes, of each sort-key buffer allocated
// by icuCollator (10 MiB).
const growBufSize = 10 << 20
61
62func (c *icuCollator) init(locale string) error {
63	err := C.UErrorCode(0)
64	c.loc = C.CString(locale)
65	c.col = C.ucol_open(c.loc, &err)
66	if err > 0 {
67		return fmt.Errorf("failed opening collator for %q", locale)
68	} else if err < 0 {
69		loc := C.ucol_getLocaleByType(c.col, 0, &err)
70		fmt, ok := map[int]string{
71			-127: "warning: using default collator: %s",
72			-128: "warning: using fallback collator: %s",
73		}[int(err)]
74		if ok {
75			log.Printf(fmt, C.GoString(loc))
76		}
77	}
78	c.keyBuf = make([]byte, 0, growBufSize)
79	return nil
80}
81
82func (c *icuCollator) buf() (*C.uint8_t, C.int32_t) {
83	if len(c.keyBuf) == cap(c.keyBuf) {
84		c.keyBuf = make([]byte, 0, growBufSize)
85	}
86	b := c.keyBuf[len(c.keyBuf):cap(c.keyBuf)]
87	return icuUInt8P(b), icuSLen(b)
88}
89
90func (c *icuCollator) extendBuf(n C.int32_t) []byte {
91	end := len(c.keyBuf) + int(n)
92	if end > cap(c.keyBuf) {
93		if len(c.keyBuf) == 0 {
94			log.Fatalf("icuCollator: max string size exceeded: %v > %v", n, growBufSize)
95		}
96		c.keyBuf = make([]byte, 0, growBufSize)
97		return nil
98	}
99	b := c.keyBuf[len(c.keyBuf):end]
100	c.keyBuf = c.keyBuf[:end]
101	return b
102}
103
104func (c *icuCollator) Close() error {
105	C.ucol_close(c.col)
106	C.free(unsafe.Pointer(c.loc))
107	return nil
108}
109
// icuUTF16 implements the Collator interface.
// Its methods pass the UTF16 field of each Input directly to the ICU
// collator.
type icuUTF16 struct {
	icuCollator
}
114
115func newUTF16(locale string) (Collator, error) {
116	c := &icuUTF16{}
117	return c, c.init(locale)
118}
119
120func (c *icuUTF16) Compare(a, b Input) int {
121	return int(C.ucol_strcoll(c.col, icuUCharP(a.UTF16), icuULen(a.UTF16), icuUCharP(b.UTF16), icuULen(b.UTF16)))
122}
123
124func (c *icuUTF16) Key(s Input) []byte {
125	bp, bn := c.buf()
126	n := C.ucol_getSortKey(c.col, icuUCharP(s.UTF16), icuULen(s.UTF16), bp, bn)
127	if b := c.extendBuf(n); b != nil {
128		return b
129	}
130	return c.Key(s)
131}
132
133// icuUTF8iter implements the Collator interface
134// This implementation wraps the UTF8 string in an iterator
135// which is passed to the collator.
136type icuUTF8iter struct {
137	icuCollator
138	a, b C.UCharIterator
139}
140
141func newUTF8iter(locale string) (Collator, error) {
142	c := &icuUTF8iter{}
143	return c, c.init(locale)
144}
145
146func (c *icuUTF8iter) Compare(a, b Input) int {
147	err := C.UErrorCode(0)
148	C.uiter_setUTF8(&c.a, icuCharP(a.UTF8), icuSLen(a.UTF8))
149	C.uiter_setUTF8(&c.b, icuCharP(b.UTF8), icuSLen(b.UTF8))
150	return int(C.ucol_strcollIter(c.col, &c.a, &c.b, &err))
151}
152
153func (c *icuUTF8iter) Key(s Input) []byte {
154	err := C.UErrorCode(0)
155	state := [2]C.uint32_t{}
156	C.uiter_setUTF8(&c.a, icuCharP(s.UTF8), icuSLen(s.UTF8))
157	bp, bn := c.buf()
158	n := C.ucol_nextSortKeyPart(c.col, &c.a, &(state[0]), bp, bn, &err)
159	if n >= bn {
160		// Force failure.
161		if c.extendBuf(n+1) != nil {
162			log.Fatal("expected extension to fail")
163		}
164		return c.Key(s)
165	}
166	return c.extendBuf(n)
167}
168
// icuUTF8conv implements the Collator interface.
// This implementation first converts the given UTF8 string
// to UTF16 and then calls the main ICU collation function.
type icuUTF8conv struct {
	icuCollator
}
175
176func newUTF8conv(locale string) (Collator, error) {
177	c := &icuUTF8conv{}
178	return c, c.init(locale)
179}
180
181func (c *icuUTF8conv) Compare(sa, sb Input) int {
182	a := encodeUTF16(sa.UTF8)
183	b := encodeUTF16(sb.UTF8)
184	return int(C.ucol_strcoll(c.col, icuUCharP(a), icuULen(a), icuUCharP(b), icuULen(b)))
185}
186
187func (c *icuUTF8conv) Key(s Input) []byte {
188	a := encodeUTF16(s.UTF8)
189	bp, bn := c.buf()
190	n := C.ucol_getSortKey(c.col, icuUCharP(a), icuULen(a), bp, bn)
191	if b := c.extendBuf(n); b != nil {
192		return b
193	}
194	return c.Key(s)
195}
196
// encodeUTF16 converts the UTF-8 bytes in b to UTF-16 code units.
//
// Runes for which utf16.EncodeRune yields a surrogate pair are emitted as
// that pair; every other rune is emitted as a single code unit. Each invalid
// UTF-8 byte decodes to utf8.RuneError and therefore becomes one U+FFFD
// unit. The result is always non-nil, even for empty input.
func encodeUTF16(b []byte) []uint16 {
	units := []uint16{}
	for i := 0; i < len(b); {
		r, size := utf8.DecodeRune(b[i:])
		i += size
		// EncodeRune reports (U+FFFD, U+FFFD) when r does not need a pair.
		if hi, lo := utf16.EncodeRune(r); hi != utf8.RuneError {
			units = append(units, uint16(hi), uint16(lo))
			continue
		}
		units = append(units, uint16(r))
	}
	return units
}
211