1// Copyright 2012 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:build icu 6// +build icu 7 8package main 9 10/* 11#cgo LDFLAGS: -licui18n -licuuc 12#include <stdlib.h> 13#include <unicode/ucol.h> 14#include <unicode/uiter.h> 15#include <unicode/utypes.h> 16*/ 17import "C" 18import ( 19 "fmt" 20 "log" 21 "unicode/utf16" 22 "unicode/utf8" 23 "unsafe" 24) 25 26func init() { 27 AddFactory(CollatorFactory{"icu", newUTF16, 28 "Main ICU collator, using native strings."}) 29 AddFactory(CollatorFactory{"icu8", newUTF8iter, 30 "ICU collator using ICU iterators to process UTF8."}) 31 AddFactory(CollatorFactory{"icu16", newUTF8conv, 32 "ICU collation by first converting UTF8 to UTF16."}) 33} 34 35func icuCharP(s []byte) *C.char { 36 return (*C.char)(unsafe.Pointer(&s[0])) 37} 38 39func icuUInt8P(s []byte) *C.uint8_t { 40 return (*C.uint8_t)(unsafe.Pointer(&s[0])) 41} 42 43func icuUCharP(s []uint16) *C.UChar { 44 return (*C.UChar)(unsafe.Pointer(&s[0])) 45} 46func icuULen(s []uint16) C.int32_t { 47 return C.int32_t(len(s)) 48} 49func icuSLen(s []byte) C.int32_t { 50 return C.int32_t(len(s)) 51} 52 53// icuCollator implements a Collator based on ICU. 54type icuCollator struct { 55 loc *C.char 56 col *C.UCollator 57 keyBuf []byte 58} 59 60const growBufSize = 10 * 1024 * 1024 61 62func (c *icuCollator) init(locale string) error { 63 err := C.UErrorCode(0) 64 c.loc = C.CString(locale) 65 c.col = C.ucol_open(c.loc, &err) 66 if err > 0 { 67 return fmt.Errorf("failed opening collator for %q", locale) 68 } else if err < 0 { 69 loc := C.ucol_getLocaleByType(c.col, 0, &err) 70 fmt, ok := map[int]string{ 71 -127: "warning: using default collator: %s", 72 -128: "warning: using fallback collator: %s", 73 }[int(err)] 74 if ok { 75 log.Printf(fmt, C.GoString(loc)) 76 } 77 } 78 c.keyBuf = make([]byte, 0, growBufSize) 79 return nil 80} 81 82func (c *icuCollator) buf() (*C.uint8_t, C.int32_t) { 83 if len(c.keyBuf) == cap(c.keyBuf) { 84 c.keyBuf = make([]byte, 0, growBufSize) 85 } 86 b := c.keyBuf[len(c.keyBuf):cap(c.keyBuf)] 87 return icuUInt8P(b), icuSLen(b) 88} 89 90func (c *icuCollator) extendBuf(n C.int32_t) []byte { 91 end := len(c.keyBuf) + int(n) 92 if end > cap(c.keyBuf) { 93 if len(c.keyBuf) == 0 { 94 log.Fatalf("icuCollator: max string size exceeded: %v > %v", n, growBufSize) 95 } 96 c.keyBuf = make([]byte, 0, growBufSize) 97 return nil 98 } 99 b := c.keyBuf[len(c.keyBuf):end] 100 c.keyBuf = c.keyBuf[:end] 101 return b 102} 103 104func (c *icuCollator) Close() error { 105 C.ucol_close(c.col) 106 C.free(unsafe.Pointer(c.loc)) 107 return nil 108} 109 110// icuUTF16 implements the Collator interface. 111type icuUTF16 struct { 112 icuCollator 113} 114 115func newUTF16(locale string) (Collator, error) { 116 c := &icuUTF16{} 117 return c, c.init(locale) 118} 119 120func (c *icuUTF16) Compare(a, b Input) int { 121 return int(C.ucol_strcoll(c.col, icuUCharP(a.UTF16), icuULen(a.UTF16), icuUCharP(b.UTF16), icuULen(b.UTF16))) 122} 123 124func (c *icuUTF16) Key(s Input) []byte { 125 bp, bn := c.buf() 126 n := C.ucol_getSortKey(c.col, icuUCharP(s.UTF16), icuULen(s.UTF16), bp, bn) 127 if b := c.extendBuf(n); b != nil { 128 return b 129 } 130 return c.Key(s) 131} 132 133// icuUTF8iter implements the Collator interface 134// This implementation wraps the UTF8 string in an iterator 135// which is passed to the collator. 136type icuUTF8iter struct { 137 icuCollator 138 a, b C.UCharIterator 139} 140 141func newUTF8iter(locale string) (Collator, error) { 142 c := &icuUTF8iter{} 143 return c, c.init(locale) 144} 145 146func (c *icuUTF8iter) Compare(a, b Input) int { 147 err := C.UErrorCode(0) 148 C.uiter_setUTF8(&c.a, icuCharP(a.UTF8), icuSLen(a.UTF8)) 149 C.uiter_setUTF8(&c.b, icuCharP(b.UTF8), icuSLen(b.UTF8)) 150 return int(C.ucol_strcollIter(c.col, &c.a, &c.b, &err)) 151} 152 153func (c *icuUTF8iter) Key(s Input) []byte { 154 err := C.UErrorCode(0) 155 state := [2]C.uint32_t{} 156 C.uiter_setUTF8(&c.a, icuCharP(s.UTF8), icuSLen(s.UTF8)) 157 bp, bn := c.buf() 158 n := C.ucol_nextSortKeyPart(c.col, &c.a, &(state[0]), bp, bn, &err) 159 if n >= bn { 160 // Force failure. 161 if c.extendBuf(n+1) != nil { 162 log.Fatal("expected extension to fail") 163 } 164 return c.Key(s) 165 } 166 return c.extendBuf(n) 167} 168 169// icuUTF8conv implements the Collator interface. 170// This implementation first converts the give UTF8 string 171// to UTF16 and then calls the main ICU collation function. 172type icuUTF8conv struct { 173 icuCollator 174} 175 176func newUTF8conv(locale string) (Collator, error) { 177 c := &icuUTF8conv{} 178 return c, c.init(locale) 179} 180 181func (c *icuUTF8conv) Compare(sa, sb Input) int { 182 a := encodeUTF16(sa.UTF8) 183 b := encodeUTF16(sb.UTF8) 184 return int(C.ucol_strcoll(c.col, icuUCharP(a), icuULen(a), icuUCharP(b), icuULen(b))) 185} 186 187func (c *icuUTF8conv) Key(s Input) []byte { 188 a := encodeUTF16(s.UTF8) 189 bp, bn := c.buf() 190 n := C.ucol_getSortKey(c.col, icuUCharP(a), icuULen(a), bp, bn) 191 if b := c.extendBuf(n); b != nil { 192 return b 193 } 194 return c.Key(s) 195} 196 197func encodeUTF16(b []byte) []uint16 { 198 a := []uint16{} 199 for len(b) > 0 { 200 r, sz := utf8.DecodeRune(b) 201 b = b[sz:] 202 r1, r2 := utf16.EncodeRune(r) 203 if r1 != 0xFFFD { 204 a = append(a, uint16(r1), uint16(r2)) 205 } else { 206 a = append(a, uint16(r)) 207 } 208 } 209 return a 210} 211