1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build icu
6
7package main
8
9/*
10#cgo LDFLAGS: -licui18n -licuuc
11#include <stdlib.h>
12#include <unicode/ucol.h>
13#include <unicode/uiter.h>
14#include <unicode/utypes.h>
15*/
16import "C"
17import (
18	"fmt"
19	"log"
20	"unicode/utf16"
21	"unicode/utf8"
22	"unsafe"
23)
24
25func init() {
26	AddFactory(CollatorFactory{"icu", newUTF16,
27		"Main ICU collator, using native strings."})
28	AddFactory(CollatorFactory{"icu8", newUTF8iter,
29		"ICU collator using ICU iterators to process UTF8."})
30	AddFactory(CollatorFactory{"icu16", newUTF8conv,
31		"ICU collation by first converting UTF8 to UTF16."})
32}
33
34func icuCharP(s []byte) *C.char {
35	return (*C.char)(unsafe.Pointer(&s[0]))
36}
37
38func icuUInt8P(s []byte) *C.uint8_t {
39	return (*C.uint8_t)(unsafe.Pointer(&s[0]))
40}
41
42func icuUCharP(s []uint16) *C.UChar {
43	return (*C.UChar)(unsafe.Pointer(&s[0]))
44}
45func icuULen(s []uint16) C.int32_t {
46	return C.int32_t(len(s))
47}
48func icuSLen(s []byte) C.int32_t {
49	return C.int32_t(len(s))
50}
51
// icuCollator implements a Collator based on ICU.
// It holds the state shared by the ICU-backed collator types in this file,
// each of which embeds it.
type icuCollator struct {
	// loc is the locale name as a C string. It is allocated in init and
	// freed in Close.
	loc    *C.char
	// col is the ICU collator handle opened in init and closed in Close.
	col    *C.UCollator
	// keyBuf is the buffer that sort keys are carved out of; its length
	// marks how much has been handed out so far (see buf and extendBuf).
	keyBuf []byte
}
58
// growBufSize is the capacity in bytes (10 MiB) of every key buffer that
// icuCollator allocates. It also bounds the size of a single sort key:
// extendBuf aborts the program when a key does not fit into an empty buffer.
const growBufSize = 10 * 1024 * 1024
60
61func (c *icuCollator) init(locale string) error {
62	err := C.UErrorCode(0)
63	c.loc = C.CString(locale)
64	c.col = C.ucol_open(c.loc, &err)
65	if err > 0 {
66		return fmt.Errorf("failed opening collator for %q", locale)
67	} else if err < 0 {
68		loc := C.ucol_getLocaleByType(c.col, 0, &err)
69		fmt, ok := map[int]string{
70			-127: "warning: using default collator: %s",
71			-128: "warning: using fallback collator: %s",
72		}[int(err)]
73		if ok {
74			log.Printf(fmt, C.GoString(loc))
75		}
76	}
77	c.keyBuf = make([]byte, 0, growBufSize)
78	return nil
79}
80
81func (c *icuCollator) buf() (*C.uint8_t, C.int32_t) {
82	if len(c.keyBuf) == cap(c.keyBuf) {
83		c.keyBuf = make([]byte, 0, growBufSize)
84	}
85	b := c.keyBuf[len(c.keyBuf):cap(c.keyBuf)]
86	return icuUInt8P(b), icuSLen(b)
87}
88
89func (c *icuCollator) extendBuf(n C.int32_t) []byte {
90	end := len(c.keyBuf) + int(n)
91	if end > cap(c.keyBuf) {
92		if len(c.keyBuf) == 0 {
93			log.Fatalf("icuCollator: max string size exceeded: %v > %v", n, growBufSize)
94		}
95		c.keyBuf = make([]byte, 0, growBufSize)
96		return nil
97	}
98	b := c.keyBuf[len(c.keyBuf):end]
99	c.keyBuf = c.keyBuf[:end]
100	return b
101}
102
103func (c *icuCollator) Close() error {
104	C.ucol_close(c.col)
105	C.free(unsafe.Pointer(c.loc))
106	return nil
107}
108
// icuUTF16 implements the Collator interface.
// Its methods hand the UTF16 field of each Input to ICU without conversion.
type icuUTF16 struct {
	icuCollator
}
113
114func newUTF16(locale string) (Collator, error) {
115	c := &icuUTF16{}
116	return c, c.init(locale)
117}
118
119func (c *icuUTF16) Compare(a, b Input) int {
120	return int(C.ucol_strcoll(c.col, icuUCharP(a.UTF16), icuULen(a.UTF16), icuUCharP(b.UTF16), icuULen(b.UTF16)))
121}
122
123func (c *icuUTF16) Key(s Input) []byte {
124	bp, bn := c.buf()
125	n := C.ucol_getSortKey(c.col, icuUCharP(s.UTF16), icuULen(s.UTF16), bp, bn)
126	if b := c.extendBuf(n); b != nil {
127		return b
128	}
129	return c.Key(s)
130}
131
// icuUTF8iter implements the Collator interface.
// This implementation wraps the UTF8 string in an iterator
// which is passed to the collator.
//
// NOTE(review): uiter_setUTF8 is handed pointers into Go-owned byte slices.
// Whether the iterators keep those pointers after the call, and whether that
// is compatible with the cgo pointer-passing rules, should be confirmed.
type icuUTF8iter struct {
	icuCollator
	// a and b are scratch iterators that Compare and Key point at their
	// inputs anew on every call.
	a, b C.UCharIterator
}
139
140func newUTF8iter(locale string) (Collator, error) {
141	c := &icuUTF8iter{}
142	return c, c.init(locale)
143}
144
145func (c *icuUTF8iter) Compare(a, b Input) int {
146	err := C.UErrorCode(0)
147	C.uiter_setUTF8(&c.a, icuCharP(a.UTF8), icuSLen(a.UTF8))
148	C.uiter_setUTF8(&c.b, icuCharP(b.UTF8), icuSLen(b.UTF8))
149	return int(C.ucol_strcollIter(c.col, &c.a, &c.b, &err))
150}
151
152func (c *icuUTF8iter) Key(s Input) []byte {
153	err := C.UErrorCode(0)
154	state := [2]C.uint32_t{}
155	C.uiter_setUTF8(&c.a, icuCharP(s.UTF8), icuSLen(s.UTF8))
156	bp, bn := c.buf()
157	n := C.ucol_nextSortKeyPart(c.col, &c.a, &(state[0]), bp, bn, &err)
158	if n >= bn {
159		// Force failure.
160		if c.extendBuf(n+1) != nil {
161			log.Fatal("expected extension to fail")
162		}
163		return c.Key(s)
164	}
165	return c.extendBuf(n)
166}
167
// icuUTF8conv implements the Collator interface.
// This implementation first converts the given UTF8 string
// to UTF16 and then calls the main ICU collation function.
type icuUTF8conv struct {
	icuCollator
}
174
175func newUTF8conv(locale string) (Collator, error) {
176	c := &icuUTF8conv{}
177	return c, c.init(locale)
178}
179
180func (c *icuUTF8conv) Compare(sa, sb Input) int {
181	a := encodeUTF16(sa.UTF8)
182	b := encodeUTF16(sb.UTF8)
183	return int(C.ucol_strcoll(c.col, icuUCharP(a), icuULen(a), icuUCharP(b), icuULen(b)))
184}
185
186func (c *icuUTF8conv) Key(s Input) []byte {
187	a := encodeUTF16(s.UTF8)
188	bp, bn := c.buf()
189	n := C.ucol_getSortKey(c.col, icuUCharP(a), icuULen(a), bp, bn)
190	if b := c.extendBuf(n); b != nil {
191		return b
192	}
193	return c.Key(s)
194}
195
// encodeUTF16 converts the UTF-8 encoded bytes in b to UTF-16 code units.
//
// Runes outside the Basic Multilingual Plane become a surrogate pair; every
// other rune becomes a single code unit. Each invalid byte in b decodes to
// utf8.RuneError and therefore yields one U+FFFD code unit. The result is
// never nil, also for empty input.
func encodeUTF16(b []byte) []uint16 {
	// A rune never needs more UTF-16 code units than it has UTF-8 bytes, so
	// len(b) is an upper bound and the slice never has to grow.
	a := make([]uint16, 0, len(b))
	for len(b) > 0 {
		r, sz := utf8.DecodeRune(b)
		b = b[sz:]
		// EncodeRune returns the replacement character for both halves when
		// r does not need a surrogate pair.
		if r1, r2 := utf16.EncodeRune(r); r1 != utf8.RuneError {
			a = append(a, uint16(r1), uint16(r2))
		} else {
			a = append(a, uint16(r))
		}
	}
	return a
}
210