1// Copyright 2012 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package build 6 7import ( 8 "fmt" 9 "unicode" 10 11 "golang.org/x/text/internal/colltab" 12) 13 14const ( 15 defaultSecondary = 0x20 16 defaultTertiary = 0x2 17 maxTertiary = 0x1F 18) 19 20type rawCE struct { 21 w []int 22 ccc uint8 23} 24 25func makeRawCE(w []int, ccc uint8) rawCE { 26 ce := rawCE{w: make([]int, 4), ccc: ccc} 27 copy(ce.w, w) 28 return ce 29} 30 31// A collation element is represented as an uint32. 32// In the typical case, a rune maps to a single collation element. If a rune 33// can be the start of a contraction or expands into multiple collation elements, 34// then the collation element that is associated with a rune will have a special 35// form to represent such m to n mappings. Such special collation elements 36// have a value >= 0x80000000. 37 38const ( 39 maxPrimaryBits = 21 40 maxSecondaryBits = 12 41 maxTertiaryBits = 8 42) 43 44func makeCE(ce rawCE) (uint32, error) { 45 v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc) 46 return uint32(v), e 47} 48 49// For contractions, collation elements are of the form 50// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where 51// - n* is the size of the first node in the contraction trie. 52// - i* is the index of the first node in the contraction trie. 53// - b* is the offset into the contraction collation element table. 54// See contract.go for details on the contraction trie. 55const ( 56 contractID = 0xC0000000 57 maxNBits = 4 58 maxTrieIndexBits = 12 59 maxContractOffsetBits = 13 60) 61 62func makeContractIndex(h ctHandle, offset int) (uint32, error) { 63 if h.n >= 1<<maxNBits { 64 return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits) 65 } 66 if h.index >= 1<<maxTrieIndexBits { 67 return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits) 68 } 69 if offset >= 1<<maxContractOffsetBits { 70 return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits) 71 } 72 ce := uint32(contractID) 73 ce += uint32(offset << (maxNBits + maxTrieIndexBits)) 74 ce += uint32(h.index << maxNBits) 75 ce += uint32(h.n) 76 return ce, nil 77} 78 79// For expansions, collation elements are of the form 80// 11100000 00000000 bbbbbbbb bbbbbbbb, 81// where b* is the index into the expansion sequence table. 82const ( 83 expandID = 0xE0000000 84 maxExpandIndexBits = 16 85) 86 87func makeExpandIndex(index int) (uint32, error) { 88 if index >= 1<<maxExpandIndexBits { 89 return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits) 90 } 91 return expandID + uint32(index), nil 92} 93 94// Each list of collation elements corresponding to an expansion starts with 95// a header indicating the length of the sequence. 96func makeExpansionHeader(n int) (uint32, error) { 97 return uint32(n), nil 98} 99 100// Some runes can be expanded using NFKD decomposition. Instead of storing the full 101// sequence of collation elements, we decompose the rune and lookup the collation 102// elements for each rune in the decomposition and modify the tertiary weights. 103// The collation element, in this case, is of the form 104// 11110000 00000000 wwwwwwww vvvvvvvv, where 105// - v* is the replacement tertiary weight for the first rune, 106// - w* is the replacement tertiary weight for the second rune, 107// Tertiary weights of subsequent runes should be replaced with maxTertiary. 108// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details. 109const ( 110 decompID = 0xF0000000 111) 112 113func makeDecompose(t1, t2 int) (uint32, error) { 114 if t1 >= 256 || t1 < 0 { 115 return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1) 116 } 117 if t2 >= 256 || t2 < 0 { 118 return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2) 119 } 120 return uint32(t2<<8+t1) + decompID, nil 121} 122 123const ( 124 // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. 125 minUnified rune = 0x4E00 126 maxUnified = 0x9FFF 127 minCompatibility = 0xF900 128 maxCompatibility = 0xFAFF 129 minRare = 0x3400 130 maxRare = 0x4DBF 131) 132const ( 133 commonUnifiedOffset = 0x10000 134 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF 135 otherOffset = 0x50000 // largest rune in rare is U+2FA1D 136 illegalOffset = otherOffset + int(unicode.MaxRune) 137 maxPrimary = illegalOffset + 1 138) 139 140// implicitPrimary returns the primary weight for the a rune 141// for which there is no entry for the rune in the collation table. 142// We take a different approach from the one specified in 143// http://unicode.org/reports/tr10/#Implicit_Weights, 144// but preserve the resulting relative ordering of the runes. 145func implicitPrimary(r rune) int { 146 if unicode.Is(unicode.Ideographic, r) { 147 if r >= minUnified && r <= maxUnified { 148 // The most common case for CJK. 149 return int(r) + commonUnifiedOffset 150 } 151 if r >= minCompatibility && r <= maxCompatibility { 152 // This will typically not hit. The DUCET explicitly specifies mappings 153 // for all characters that do not decompose. 154 return int(r) + commonUnifiedOffset 155 } 156 return int(r) + rareUnifiedOffset 157 } 158 return int(r) + otherOffset 159} 160 161// convertLargeWeights converts collation elements with large 162// primaries (either double primaries or for illegal runes) 163// to our own representation. 164// A CJK character C is represented in the DUCET as 165// [.FBxx.0020.0002.C][.BBBB.0000.0000.C] 166// We will rewrite these characters to a single CE. 167// We assume the CJK values start at 0x8000. 168// See http://unicode.org/reports/tr10/#Implicit_Weights 169func convertLargeWeights(elems []rawCE) (res []rawCE, err error) { 170 const ( 171 cjkPrimaryStart = 0xFB40 172 rarePrimaryStart = 0xFB80 173 otherPrimaryStart = 0xFBC0 174 illegalPrimary = 0xFFFE 175 highBitsMask = 0x3F 176 lowBitsMask = 0x7FFF 177 lowBitsFlag = 0x8000 178 shiftBits = 15 179 ) 180 for i := 0; i < len(elems); i++ { 181 ce := elems[i].w 182 p := ce[0] 183 if p < cjkPrimaryStart { 184 continue 185 } 186 if p > 0xFFFF { 187 return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p) 188 } 189 if p >= illegalPrimary { 190 ce[0] = illegalOffset + p - illegalPrimary 191 } else { 192 if i+1 >= len(elems) { 193 return elems, fmt.Errorf("second part of double primary weight missing: %v", elems) 194 } 195 if elems[i+1].w[0]&lowBitsFlag == 0 { 196 return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems) 197 } 198 np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask 199 switch { 200 case p < rarePrimaryStart: 201 np += commonUnifiedOffset 202 case p < otherPrimaryStart: 203 np += rareUnifiedOffset 204 default: 205 p += otherOffset 206 } 207 ce[0] = np 208 for j := i + 1; j+1 < len(elems); j++ { 209 elems[j] = elems[j+1] 210 } 211 elems = elems[:len(elems)-1] 212 } 213 } 214 return elems, nil 215} 216 217// nextWeight computes the first possible collation weights following elems 218// for the given level. 219func nextWeight(level colltab.Level, elems []rawCE) []rawCE { 220 if level == colltab.Identity { 221 next := make([]rawCE, len(elems)) 222 copy(next, elems) 223 return next 224 } 225 next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)} 226 next[0].w[level]++ 227 if level < colltab.Secondary { 228 next[0].w[colltab.Secondary] = defaultSecondary 229 } 230 if level < colltab.Tertiary { 231 next[0].w[colltab.Tertiary] = defaultTertiary 232 } 233 // Filter entries that cannot influence ordering. 234 for _, ce := range elems[1:] { 235 skip := true 236 for i := colltab.Primary; i < level; i++ { 237 skip = skip && ce.w[i] == 0 238 } 239 if !skip { 240 next = append(next, ce) 241 } 242 } 243 return next 244} 245 246func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) { 247 for ; i < len(elems) && elems[i].w[level] == 0; i++ { 248 } 249 if i < len(elems) { 250 return i, elems[i].w[level] 251 } 252 return i, 0 253} 254 255// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise. 256// It also returns the collation level at which the difference is found. 257func compareWeights(a, b []rawCE) (result int, level colltab.Level) { 258 for level := colltab.Primary; level < colltab.Identity; level++ { 259 var va, vb int 260 for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 { 261 ia, va = nextVal(a, ia, level) 262 ib, vb = nextVal(b, ib, level) 263 if va != vb { 264 if va < vb { 265 return -1, level 266 } else { 267 return 1, level 268 } 269 } 270 } 271 } 272 return 0, colltab.Identity 273} 274 275func equalCE(a, b rawCE) bool { 276 for i := 0; i < 3; i++ { 277 if b.w[i] != a.w[i] { 278 return false 279 } 280 } 281 return true 282} 283 284func equalCEArrays(a, b []rawCE) bool { 285 if len(a) != len(b) { 286 return false 287 } 288 for i := range a { 289 if !equalCE(a[i], b[i]) { 290 return false 291 } 292 } 293 return true 294} 295