1// Copyright 2012 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build icu 6 7package main 8 9/* 10#cgo LDFLAGS: -licui18n -licuuc 11#include <stdlib.h> 12#include <unicode/ucol.h> 13#include <unicode/uiter.h> 14#include <unicode/utypes.h> 15*/ 16import "C" 17import ( 18 "fmt" 19 "log" 20 "unicode/utf16" 21 "unicode/utf8" 22 "unsafe" 23) 24 25func init() { 26 AddFactory(CollatorFactory{"icu", newUTF16, 27 "Main ICU collator, using native strings."}) 28 AddFactory(CollatorFactory{"icu8", newUTF8iter, 29 "ICU collator using ICU iterators to process UTF8."}) 30 AddFactory(CollatorFactory{"icu16", newUTF8conv, 31 "ICU collation by first converting UTF8 to UTF16."}) 32} 33 34func icuCharP(s []byte) *C.char { 35 return (*C.char)(unsafe.Pointer(&s[0])) 36} 37 38func icuUInt8P(s []byte) *C.uint8_t { 39 return (*C.uint8_t)(unsafe.Pointer(&s[0])) 40} 41 42func icuUCharP(s []uint16) *C.UChar { 43 return (*C.UChar)(unsafe.Pointer(&s[0])) 44} 45func icuULen(s []uint16) C.int32_t { 46 return C.int32_t(len(s)) 47} 48func icuSLen(s []byte) C.int32_t { 49 return C.int32_t(len(s)) 50} 51 52// icuCollator implements a Collator based on ICU. 53type icuCollator struct { 54 loc *C.char 55 col *C.UCollator 56 keyBuf []byte 57} 58 59const growBufSize = 10 * 1024 * 1024 60 61func (c *icuCollator) init(locale string) error { 62 err := C.UErrorCode(0) 63 c.loc = C.CString(locale) 64 c.col = C.ucol_open(c.loc, &err) 65 if err > 0 { 66 return fmt.Errorf("failed opening collator for %q", locale) 67 } else if err < 0 { 68 loc := C.ucol_getLocaleByType(c.col, 0, &err) 69 fmt, ok := map[int]string{ 70 -127: "warning: using default collator: %s", 71 -128: "warning: using fallback collator: %s", 72 }[int(err)] 73 if ok { 74 log.Printf(fmt, C.GoString(loc)) 75 } 76 } 77 c.keyBuf = make([]byte, 0, growBufSize) 78 return nil 79} 80 81func (c *icuCollator) buf() (*C.uint8_t, C.int32_t) { 82 if len(c.keyBuf) == cap(c.keyBuf) { 83 c.keyBuf = make([]byte, 0, growBufSize) 84 } 85 b := c.keyBuf[len(c.keyBuf):cap(c.keyBuf)] 86 return icuUInt8P(b), icuSLen(b) 87} 88 89func (c *icuCollator) extendBuf(n C.int32_t) []byte { 90 end := len(c.keyBuf) + int(n) 91 if end > cap(c.keyBuf) { 92 if len(c.keyBuf) == 0 { 93 log.Fatalf("icuCollator: max string size exceeded: %v > %v", n, growBufSize) 94 } 95 c.keyBuf = make([]byte, 0, growBufSize) 96 return nil 97 } 98 b := c.keyBuf[len(c.keyBuf):end] 99 c.keyBuf = c.keyBuf[:end] 100 return b 101} 102 103func (c *icuCollator) Close() error { 104 C.ucol_close(c.col) 105 C.free(unsafe.Pointer(c.loc)) 106 return nil 107} 108 109// icuUTF16 implements the Collator interface. 110type icuUTF16 struct { 111 icuCollator 112} 113 114func newUTF16(locale string) (Collator, error) { 115 c := &icuUTF16{} 116 return c, c.init(locale) 117} 118 119func (c *icuUTF16) Compare(a, b Input) int { 120 return int(C.ucol_strcoll(c.col, icuUCharP(a.UTF16), icuULen(a.UTF16), icuUCharP(b.UTF16), icuULen(b.UTF16))) 121} 122 123func (c *icuUTF16) Key(s Input) []byte { 124 bp, bn := c.buf() 125 n := C.ucol_getSortKey(c.col, icuUCharP(s.UTF16), icuULen(s.UTF16), bp, bn) 126 if b := c.extendBuf(n); b != nil { 127 return b 128 } 129 return c.Key(s) 130} 131 132// icuUTF8iter implements the Collator interface 133// This implementation wraps the UTF8 string in an iterator 134// which is passed to the collator. 135type icuUTF8iter struct { 136 icuCollator 137 a, b C.UCharIterator 138} 139 140func newUTF8iter(locale string) (Collator, error) { 141 c := &icuUTF8iter{} 142 return c, c.init(locale) 143} 144 145func (c *icuUTF8iter) Compare(a, b Input) int { 146 err := C.UErrorCode(0) 147 C.uiter_setUTF8(&c.a, icuCharP(a.UTF8), icuSLen(a.UTF8)) 148 C.uiter_setUTF8(&c.b, icuCharP(b.UTF8), icuSLen(b.UTF8)) 149 return int(C.ucol_strcollIter(c.col, &c.a, &c.b, &err)) 150} 151 152func (c *icuUTF8iter) Key(s Input) []byte { 153 err := C.UErrorCode(0) 154 state := [2]C.uint32_t{} 155 C.uiter_setUTF8(&c.a, icuCharP(s.UTF8), icuSLen(s.UTF8)) 156 bp, bn := c.buf() 157 n := C.ucol_nextSortKeyPart(c.col, &c.a, &(state[0]), bp, bn, &err) 158 if n >= bn { 159 // Force failure. 160 if c.extendBuf(n+1) != nil { 161 log.Fatal("expected extension to fail") 162 } 163 return c.Key(s) 164 } 165 return c.extendBuf(n) 166} 167 168// icuUTF8conv implements the Collator interface. 169// This implementation first converts the give UTF8 string 170// to UTF16 and then calls the main ICU collation function. 171type icuUTF8conv struct { 172 icuCollator 173} 174 175func newUTF8conv(locale string) (Collator, error) { 176 c := &icuUTF8conv{} 177 return c, c.init(locale) 178} 179 180func (c *icuUTF8conv) Compare(sa, sb Input) int { 181 a := encodeUTF16(sa.UTF8) 182 b := encodeUTF16(sb.UTF8) 183 return int(C.ucol_strcoll(c.col, icuUCharP(a), icuULen(a), icuUCharP(b), icuULen(b))) 184} 185 186func (c *icuUTF8conv) Key(s Input) []byte { 187 a := encodeUTF16(s.UTF8) 188 bp, bn := c.buf() 189 n := C.ucol_getSortKey(c.col, icuUCharP(a), icuULen(a), bp, bn) 190 if b := c.extendBuf(n); b != nil { 191 return b 192 } 193 return c.Key(s) 194} 195 196func encodeUTF16(b []byte) []uint16 { 197 a := []uint16{} 198 for len(b) > 0 { 199 r, sz := utf8.DecodeRune(b) 200 b = b[sz:] 201 r1, r2 := utf16.EncodeRune(r) 202 if r1 != 0xFFFD { 203 a = append(a, uint16(r1), uint16(r2)) 204 } else { 205 a = append(a, uint16(r)) 206 } 207 } 208 return a 209} 210