1// Copyright 2012 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package collate 6 7import ( 8 "unicode" 9) 10 11// Level identifies the collation comparison level. 12// The primary level corresponds to the basic sorting of text. 13// The secondary level corresponds to accents and related linguistic elements. 14// The tertiary level corresponds to casing and related concepts. 15// The quaternary level is derived from the other levels by the 16// various algorithms for handling variable elements. 17type Level int 18 19const ( 20 Primary Level = iota 21 Secondary 22 Tertiary 23 Quaternary 24 Identity 25) 26 27const ( 28 defaultSecondary = 0x20 29 defaultTertiary = 0x2 30 maxTertiary = 0x1F 31 MaxQuaternary = 0x1FFFFF // 21 bits. 32) 33 34// Elem is a representation of a collation element. This API provides ways to encode 35// and decode Elems. Implementations of collation tables may use values greater 36// or equal to PrivateUse for their own purposes. However, these should never be 37// returned by AppendNext. 38type Elem uint32 39 40const ( 41 maxCE Elem = 0xAFFFFFFF 42 PrivateUse = minContract 43 minContract = 0xC0000000 44 maxContract = 0xDFFFFFFF 45 minExpand = 0xE0000000 46 maxExpand = 0xEFFFFFFF 47 minDecomp = 0xF0000000 48) 49 50type ceType int 51 52const ( 53 ceNormal ceType = iota // ceNormal includes implicits (ce == 0) 54 ceContractionIndex // rune can be a start of a contraction 55 ceExpansionIndex // rune expands into a sequence of collation elements 56 ceDecompose // rune expands using NFKC decomposition 57) 58 59func (ce Elem) ctype() ceType { 60 if ce <= maxCE { 61 return ceNormal 62 } 63 if ce <= maxContract { 64 return ceContractionIndex 65 } else { 66 if ce <= maxExpand { 67 return ceExpansionIndex 68 } 69 return ceDecompose 70 } 71 panic("should not reach here") 72 return ceType(-1) 73} 74 75// For normal collation elements, we assume that a collation element either has 76// a primary or non-default secondary value, not both. 77// Collation elements with a primary value are of the form 78// 01pppppp pppppppp ppppppp0 ssssssss 79// - p* is primary collation value 80// - s* is the secondary collation value 81// 00pppppp pppppppp ppppppps sssttttt, where 82// - p* is primary collation value 83// - s* offset of secondary from default value. 84// - t* is the tertiary collation value 85// 100ttttt cccccccc pppppppp pppppppp 86// - t* is the tertiar collation value 87// - c* is the cannonical combining class 88// - p* is the primary collation value 89// Collation elements with a secondary value are of the form 90// 1010cccc ccccssss ssssssss tttttttt, where 91// - c* is the canonical combining class 92// - s* is the secondary collation value 93// - t* is the tertiary collation value 94// 11qqqqqq qqqqqqqq qqqqqqq0 00000000 95// - q* quaternary value 96const ( 97 ceTypeMask = 0xC0000000 98 ceTypeMaskExt = 0xE0000000 99 ceType1 = 0x40000000 100 ceType2 = 0x00000000 101 ceType3or4 = 0x80000000 102 ceType4 = 0xA0000000 103 ceTypeQ = 0xC0000000 104 ceIgnore = ceType4 105 firstNonPrimary = 0x80000000 106 lastSpecialPrimary = 0xA0000000 107 secondaryMask = 0x80000000 108 hasTertiaryMask = 0x40000000 109 primaryValueMask = 0x3FFFFE00 110 primaryShift = 9 111 compactPrimaryBits = 16 112 compactSecondaryShift = 5 113 minCompactSecondary = defaultSecondary - 4 114) 115 116func makeImplicitCE(primary int) Elem { 117 return ceType1 | Elem(primary<<primaryShift) | defaultSecondary 118} 119 120// MakeElem returns an Elem for the given values. It will return an error 121// if the given combination of values is invalid. 122func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) { 123 // TODO: implement 124 return 0, nil 125} 126 127// MakeQuaternary returns an Elem with the given quaternary value. 128func MakeQuaternary(v int) Elem { 129 return ceTypeQ | Elem(v<<primaryShift) 130} 131 132// Mask sets weights for any level smaller than l to 0. 133// The resulting Elem can be used to test for equality with 134// other Elems to which the same mask has been applied. 135func (ce Elem) Mask(l Level) uint32 { 136 return 0 137} 138 139// CCC returns the canoncial combining class associated with the underlying character, 140// if applicable, or 0 otherwise. 141func (ce Elem) CCC() uint8 { 142 if ce&ceType3or4 != 0 { 143 if ce&ceType4 == ceType3or4 { 144 return uint8(ce >> 16) 145 } 146 return uint8(ce >> 20) 147 } 148 return 0 149} 150 151// Primary returns the primary collation weight for ce. 152func (ce Elem) Primary() int { 153 if ce >= firstNonPrimary { 154 if ce > lastSpecialPrimary { 155 return 0 156 } 157 return int(uint16(ce)) 158 } 159 return int(ce&primaryValueMask) >> primaryShift 160} 161 162// Secondary returns the secondary collation weight for ce. 163func (ce Elem) Secondary() int { 164 switch ce & ceTypeMask { 165 case ceType1: 166 return int(uint8(ce)) 167 case ceType2: 168 return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF) 169 case ceType3or4: 170 if ce < ceType4 { 171 return defaultSecondary 172 } 173 return int(ce>>8) & 0xFFF 174 case ceTypeQ: 175 return 0 176 } 177 panic("should not reach here") 178} 179 180// Tertiary returns the tertiary collation weight for ce. 181func (ce Elem) Tertiary() uint8 { 182 if ce&hasTertiaryMask == 0 { 183 if ce&ceType3or4 == 0 { 184 return uint8(ce & 0x1F) 185 } 186 if ce&ceType4 == ceType4 { 187 return uint8(ce) 188 } 189 return uint8(ce>>24) & 0x1F // type 2 190 } else if ce&ceTypeMask == ceType1 { 191 return defaultTertiary 192 } 193 // ce is a quaternary value. 194 return 0 195} 196 197func (ce Elem) updateTertiary(t uint8) Elem { 198 if ce&ceTypeMask == ceType1 { 199 // convert to type 4 200 nce := ce & primaryValueMask 201 nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift 202 ce = nce 203 } else if ce&ceTypeMaskExt == ceType3or4 { 204 ce &= ^Elem(maxTertiary << 24) 205 return ce | (Elem(t) << 24) 206 } else { 207 // type 2 or 4 208 ce &= ^Elem(maxTertiary) 209 } 210 return ce | Elem(t) 211} 212 213// Quaternary returns the quaternary value if explicitly specified, 214// 0 if ce == ceIgnore, or MaxQuaternary otherwise. 215// Quaternary values are used only for shifted variants. 216func (ce Elem) Quaternary() int { 217 if ce&ceTypeMask == ceTypeQ { 218 return int(ce&primaryValueMask) >> primaryShift 219 } else if ce == ceIgnore { 220 return 0 221 } 222 return MaxQuaternary 223} 224 225// Weight returns the collation weight for the given level. 226func (ce Elem) Weight(l Level) int { 227 switch l { 228 case Primary: 229 return ce.Primary() 230 case Secondary: 231 return ce.Secondary() 232 case Tertiary: 233 return int(ce.Tertiary()) 234 case Quaternary: 235 return ce.Quaternary() 236 } 237 return 0 // return 0 (ignore) for undefined levels. 238} 239 240// For contractions, collation elements are of the form 241// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where 242// - n* is the size of the first node in the contraction trie. 243// - i* is the index of the first node in the contraction trie. 244// - b* is the offset into the contraction collation element table. 245// See contract.go for details on the contraction trie. 246const ( 247 maxNBits = 4 248 maxTrieIndexBits = 12 249 maxContractOffsetBits = 13 250) 251 252func splitContractIndex(ce Elem) (index, n, offset int) { 253 n = int(ce & (1<<maxNBits - 1)) 254 ce >>= maxNBits 255 index = int(ce & (1<<maxTrieIndexBits - 1)) 256 ce >>= maxTrieIndexBits 257 offset = int(ce & (1<<maxContractOffsetBits - 1)) 258 return 259} 260 261// For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb, 262// where b* is the index into the expansion sequence table. 263const maxExpandIndexBits = 16 264 265func splitExpandIndex(ce Elem) (index int) { 266 return int(uint16(ce)) 267} 268 269// Some runes can be expanded using NFKD decomposition. Instead of storing the full 270// sequence of collation elements, we decompose the rune and lookup the collation 271// elements for each rune in the decomposition and modify the tertiary weights. 272// The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where 273// - v* is the replacement tertiary weight for the first rune, 274// - w* is the replacement tertiary weight for the second rune, 275// Tertiary weights of subsequent runes should be replaced with maxTertiary. 276// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details. 277func splitDecompose(ce Elem) (t1, t2 uint8) { 278 return uint8(ce), uint8(ce >> 8) 279} 280 281const ( 282 // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. 283 minUnified rune = 0x4E00 284 maxUnified = 0x9FFF 285 minCompatibility = 0xF900 286 maxCompatibility = 0xFAFF 287 minRare = 0x3400 288 maxRare = 0x4DBF 289) 290const ( 291 commonUnifiedOffset = 0x10000 292 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF 293 otherOffset = 0x50000 // largest rune in rare is U+2FA1D 294 illegalOffset = otherOffset + int(unicode.MaxRune) 295 maxPrimary = illegalOffset + 1 296) 297 298// implicitPrimary returns the primary weight for the a rune 299// for which there is no entry for the rune in the collation table. 300// We take a different approach from the one specified in 301// http://unicode.org/reports/tr10/#Implicit_Weights, 302// but preserve the resulting relative ordering of the runes. 303func implicitPrimary(r rune) int { 304 if unicode.Is(unicode.Ideographic, r) { 305 if r >= minUnified && r <= maxUnified { 306 // The most common case for CJK. 307 return int(r) + commonUnifiedOffset 308 } 309 if r >= minCompatibility && r <= maxCompatibility { 310 // This will typically not hit. The DUCET explicitly specifies mappings 311 // for all characters that do not decompose. 312 return int(r) + commonUnifiedOffset 313 } 314 return int(r) + rareUnifiedOffset 315 } 316 return int(r) + otherOffset 317} 318