1// Copyright 2012 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package colltab 6 7import ( 8 "fmt" 9 "unicode" 10) 11 12// Level identifies the collation comparison level. 13// The primary level corresponds to the basic sorting of text. 14// The secondary level corresponds to accents and related linguistic elements. 15// The tertiary level corresponds to casing and related concepts. 16// The quaternary level is derived from the other levels by the 17// various algorithms for handling variable elements. 18type Level int 19 20const ( 21 Primary Level = iota 22 Secondary 23 Tertiary 24 Quaternary 25 Identity 26 27 NumLevels 28) 29 30const ( 31 defaultSecondary = 0x20 32 defaultTertiary = 0x2 33 maxTertiary = 0x1F 34 MaxQuaternary = 0x1FFFFF // 21 bits. 35) 36 37// Elem is a representation of a collation element. This API provides ways to encode 38// and decode Elems. Implementations of collation tables may use values greater 39// or equal to PrivateUse for their own purposes. However, these should never be 40// returned by AppendNext. 41type Elem uint32 42 43const ( 44 maxCE Elem = 0xAFFFFFFF 45 PrivateUse = minContract 46 minContract = 0xC0000000 47 maxContract = 0xDFFFFFFF 48 minExpand = 0xE0000000 49 maxExpand = 0xEFFFFFFF 50 minDecomp = 0xF0000000 51) 52 53type ceType int 54 55const ( 56 ceNormal ceType = iota // ceNormal includes implicits (ce == 0) 57 ceContractionIndex // rune can be a start of a contraction 58 ceExpansionIndex // rune expands into a sequence of collation elements 59 ceDecompose // rune expands using NFKC decomposition 60) 61 62func (ce Elem) ctype() ceType { 63 if ce <= maxCE { 64 return ceNormal 65 } 66 if ce <= maxContract { 67 return ceContractionIndex 68 } else { 69 if ce <= maxExpand { 70 return ceExpansionIndex 71 } 72 return ceDecompose 73 } 74 panic("should not reach here") 75 return ceType(-1) 76} 77 78// For normal collation elements, we assume that a collation element either has 79// a primary or non-default secondary value, not both. 80// Collation elements with a primary value are of the form 81// 01pppppp pppppppp ppppppp0 ssssssss 82// - p* is primary collation value 83// - s* is the secondary collation value 84// 00pppppp pppppppp ppppppps sssttttt, where 85// - p* is primary collation value 86// - s* offset of secondary from default value. 87// - t* is the tertiary collation value 88// 100ttttt cccccccc pppppppp pppppppp 89// - t* is the tertiar collation value 90// - c* is the canonical combining class 91// - p* is the primary collation value 92// Collation elements with a secondary value are of the form 93// 1010cccc ccccssss ssssssss tttttttt, where 94// - c* is the canonical combining class 95// - s* is the secondary collation value 96// - t* is the tertiary collation value 97// 11qqqqqq qqqqqqqq qqqqqqq0 00000000 98// - q* quaternary value 99const ( 100 ceTypeMask = 0xC0000000 101 ceTypeMaskExt = 0xE0000000 102 ceIgnoreMask = 0xF00FFFFF 103 ceType1 = 0x40000000 104 ceType2 = 0x00000000 105 ceType3or4 = 0x80000000 106 ceType4 = 0xA0000000 107 ceTypeQ = 0xC0000000 108 Ignore = ceType4 109 firstNonPrimary = 0x80000000 110 lastSpecialPrimary = 0xA0000000 111 secondaryMask = 0x80000000 112 hasTertiaryMask = 0x40000000 113 primaryValueMask = 0x3FFFFE00 114 maxPrimaryBits = 21 115 compactPrimaryBits = 16 116 maxSecondaryBits = 12 117 maxTertiaryBits = 8 118 maxCCCBits = 8 119 maxSecondaryCompactBits = 8 120 maxSecondaryDiffBits = 4 121 maxTertiaryCompactBits = 5 122 primaryShift = 9 123 compactSecondaryShift = 5 124 minCompactSecondary = defaultSecondary - 4 125) 126 127func makeImplicitCE(primary int) Elem { 128 return ceType1 | Elem(primary<<primaryShift) | defaultSecondary 129} 130 131// MakeElem returns an Elem for the given values. It will return an error 132// if the given combination of values is invalid. 133func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) { 134 if w := primary; w >= 1<<maxPrimaryBits || w < 0 { 135 return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits) 136 } 137 if w := secondary; w >= 1<<maxSecondaryBits || w < 0 { 138 return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits) 139 } 140 if w := tertiary; w >= 1<<maxTertiaryBits || w < 0 { 141 return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits) 142 } 143 ce := Elem(0) 144 if primary != 0 { 145 if ccc != 0 { 146 if primary >= 1<<compactPrimaryBits { 147 return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", primary, 1<<compactPrimaryBits) 148 } 149 if secondary != defaultSecondary { 150 return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", secondary, ccc) 151 } 152 ce = Elem(tertiary << (compactPrimaryBits + maxCCCBits)) 153 ce |= Elem(ccc) << compactPrimaryBits 154 ce |= Elem(primary) 155 ce |= ceType3or4 156 } else if tertiary == defaultTertiary { 157 if secondary >= 1<<maxSecondaryCompactBits { 158 return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", secondary, 1<<maxSecondaryCompactBits) 159 } 160 ce = Elem(primary<<(maxSecondaryCompactBits+1) + secondary) 161 ce |= ceType1 162 } else { 163 d := secondary - defaultSecondary + maxSecondaryDiffBits 164 if d >= 1<<maxSecondaryDiffBits || d < 0 { 165 return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits) 166 } 167 if tertiary >= 1<<maxTertiaryCompactBits { 168 return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x", tertiary, 1<<maxTertiaryCompactBits) 169 } 170 ce = Elem(primary<<maxSecondaryDiffBits + d) 171 ce = ce<<maxTertiaryCompactBits + Elem(tertiary) 172 } 173 } else { 174 ce = Elem(secondary<<maxTertiaryBits + tertiary) 175 ce += Elem(ccc) << (maxSecondaryBits + maxTertiaryBits) 176 ce |= ceType4 177 } 178 return ce, nil 179} 180 181// MakeQuaternary returns an Elem with the given quaternary value. 182func MakeQuaternary(v int) Elem { 183 return ceTypeQ | Elem(v<<primaryShift) 184} 185 186// Mask sets weights for any level smaller than l to 0. 187// The resulting Elem can be used to test for equality with 188// other Elems to which the same mask has been applied. 189func (ce Elem) Mask(l Level) uint32 { 190 return 0 191} 192 193// CCC returns the canonical combining class associated with the underlying character, 194// if applicable, or 0 otherwise. 195func (ce Elem) CCC() uint8 { 196 if ce&ceType3or4 != 0 { 197 if ce&ceType4 == ceType3or4 { 198 return uint8(ce >> 16) 199 } 200 return uint8(ce >> 20) 201 } 202 return 0 203} 204 205// Primary returns the primary collation weight for ce. 206func (ce Elem) Primary() int { 207 if ce >= firstNonPrimary { 208 if ce > lastSpecialPrimary { 209 return 0 210 } 211 return int(uint16(ce)) 212 } 213 return int(ce&primaryValueMask) >> primaryShift 214} 215 216// Secondary returns the secondary collation weight for ce. 217func (ce Elem) Secondary() int { 218 switch ce & ceTypeMask { 219 case ceType1: 220 return int(uint8(ce)) 221 case ceType2: 222 return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF) 223 case ceType3or4: 224 if ce < ceType4 { 225 return defaultSecondary 226 } 227 return int(ce>>8) & 0xFFF 228 case ceTypeQ: 229 return 0 230 } 231 panic("should not reach here") 232} 233 234// Tertiary returns the tertiary collation weight for ce. 235func (ce Elem) Tertiary() uint8 { 236 if ce&hasTertiaryMask == 0 { 237 if ce&ceType3or4 == 0 { 238 return uint8(ce & 0x1F) 239 } 240 if ce&ceType4 == ceType4 { 241 return uint8(ce) 242 } 243 return uint8(ce>>24) & 0x1F // type 2 244 } else if ce&ceTypeMask == ceType1 { 245 return defaultTertiary 246 } 247 // ce is a quaternary value. 248 return 0 249} 250 251func (ce Elem) updateTertiary(t uint8) Elem { 252 if ce&ceTypeMask == ceType1 { 253 // convert to type 4 254 nce := ce & primaryValueMask 255 nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift 256 ce = nce 257 } else if ce&ceTypeMaskExt == ceType3or4 { 258 ce &= ^Elem(maxTertiary << 24) 259 return ce | (Elem(t) << 24) 260 } else { 261 // type 2 or 4 262 ce &= ^Elem(maxTertiary) 263 } 264 return ce | Elem(t) 265} 266 267// Quaternary returns the quaternary value if explicitly specified, 268// 0 if ce == Ignore, or MaxQuaternary otherwise. 269// Quaternary values are used only for shifted variants. 270func (ce Elem) Quaternary() int { 271 if ce&ceTypeMask == ceTypeQ { 272 return int(ce&primaryValueMask) >> primaryShift 273 } else if ce&ceIgnoreMask == Ignore { 274 return 0 275 } 276 return MaxQuaternary 277} 278 279// Weight returns the collation weight for the given level. 280func (ce Elem) Weight(l Level) int { 281 switch l { 282 case Primary: 283 return ce.Primary() 284 case Secondary: 285 return ce.Secondary() 286 case Tertiary: 287 return int(ce.Tertiary()) 288 case Quaternary: 289 return ce.Quaternary() 290 } 291 return 0 // return 0 (ignore) for undefined levels. 292} 293 294// For contractions, collation elements are of the form 295// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where 296// - n* is the size of the first node in the contraction trie. 297// - i* is the index of the first node in the contraction trie. 298// - b* is the offset into the contraction collation element table. 299// See contract.go for details on the contraction trie. 300const ( 301 maxNBits = 4 302 maxTrieIndexBits = 12 303 maxContractOffsetBits = 13 304) 305 306func splitContractIndex(ce Elem) (index, n, offset int) { 307 n = int(ce & (1<<maxNBits - 1)) 308 ce >>= maxNBits 309 index = int(ce & (1<<maxTrieIndexBits - 1)) 310 ce >>= maxTrieIndexBits 311 offset = int(ce & (1<<maxContractOffsetBits - 1)) 312 return 313} 314 315// For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb, 316// where b* is the index into the expansion sequence table. 317const maxExpandIndexBits = 16 318 319func splitExpandIndex(ce Elem) (index int) { 320 return int(uint16(ce)) 321} 322 323// Some runes can be expanded using NFKD decomposition. Instead of storing the full 324// sequence of collation elements, we decompose the rune and lookup the collation 325// elements for each rune in the decomposition and modify the tertiary weights. 326// The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where 327// - v* is the replacement tertiary weight for the first rune, 328// - w* is the replacement tertiary weight for the second rune, 329// Tertiary weights of subsequent runes should be replaced with maxTertiary. 330// See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details. 331func splitDecompose(ce Elem) (t1, t2 uint8) { 332 return uint8(ce), uint8(ce >> 8) 333} 334 335const ( 336 // These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. 337 minUnified rune = 0x4E00 338 maxUnified = 0x9FFF 339 minCompatibility = 0xF900 340 maxCompatibility = 0xFAFF 341 minRare = 0x3400 342 maxRare = 0x4DBF 343) 344const ( 345 commonUnifiedOffset = 0x10000 346 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF 347 otherOffset = 0x50000 // largest rune in rare is U+2FA1D 348 illegalOffset = otherOffset + int(unicode.MaxRune) 349 maxPrimary = illegalOffset + 1 350) 351 352// implicitPrimary returns the primary weight for the a rune 353// for which there is no entry for the rune in the collation table. 354// We take a different approach from the one specified in 355// https://unicode.org/reports/tr10/#Implicit_Weights, 356// but preserve the resulting relative ordering of the runes. 357func implicitPrimary(r rune) int { 358 if unicode.Is(unicode.Ideographic, r) { 359 if r >= minUnified && r <= maxUnified { 360 // The most common case for CJK. 361 return int(r) + commonUnifiedOffset 362 } 363 if r >= minCompatibility && r <= maxCompatibility { 364 // This will typically not hit. The DUCET explicitly specifies mappings 365 // for all characters that do not decompose. 366 return int(r) + commonUnifiedOffset 367 } 368 return int(r) + rareUnifiedOffset 369 } 370 return int(r) + otherOffset 371} 372