1package uniseg 2 3import "unicode/utf8" 4 5// The states of the grapheme cluster parser. 6const ( 7 grAny = iota 8 grCR 9 grControlLF 10 grL 11 grLVV 12 grLVTT 13 grPrepend 14 grExtendedPictographic 15 grExtendedPictographicZWJ 16 grRIOdd 17 grRIEven 18) 19 20// The grapheme cluster parser's breaking instructions. 21const ( 22 grNoBoundary = iota 23 grBoundary 24) 25 26// The grapheme cluster parser's state transitions. Maps (state, property) to 27// (new state, breaking instruction, rule number). The breaking instruction 28// always refers to the boundary between the last and next code point. 29// 30// This map is queried as follows: 31// 32// 1. Find specific state + specific property. Stop if found. 33// 2. Find specific state + any property. 34// 3. Find any state + specific property. 35// 4. If only (2) or (3) (but not both) was found, stop. 36// 5. If both (2) and (3) were found, use state and breaking instruction from 37// the transition with the lower rule number, prefer (3) if rule numbers 38// are equal. Stop. 39// 6. Assume grAny and grBoundary. 40var grTransitions = map[[2]int][3]int{ 41 // GB5 42 {grAny, prCR}: {grCR, grBoundary, 50}, 43 {grAny, prLF}: {grControlLF, grBoundary, 50}, 44 {grAny, prControl}: {grControlLF, grBoundary, 50}, 45 46 // GB4 47 {grCR, prAny}: {grAny, grBoundary, 40}, 48 {grControlLF, prAny}: {grAny, grBoundary, 40}, 49 50 // GB3. 51 {grCR, prLF}: {grAny, grNoBoundary, 30}, 52 53 // GB6. 54 {grAny, prL}: {grL, grBoundary, 9990}, 55 {grL, prL}: {grL, grNoBoundary, 60}, 56 {grL, prV}: {grLVV, grNoBoundary, 60}, 57 {grL, prLV}: {grLVV, grNoBoundary, 60}, 58 {grL, prLVT}: {grLVTT, grNoBoundary, 60}, 59 60 // GB7. 61 {grAny, prLV}: {grLVV, grBoundary, 9990}, 62 {grAny, prV}: {grLVV, grBoundary, 9990}, 63 {grLVV, prV}: {grLVV, grNoBoundary, 70}, 64 {grLVV, prT}: {grLVTT, grNoBoundary, 70}, 65 66 // GB8. 67 {grAny, prLVT}: {grLVTT, grBoundary, 9990}, 68 {grAny, prT}: {grLVTT, grBoundary, 9990}, 69 {grLVTT, prT}: {grLVTT, grNoBoundary, 80}, 70 71 // GB9. 72 {grAny, prExtend}: {grAny, grNoBoundary, 90}, 73 {grAny, prZWJ}: {grAny, grNoBoundary, 90}, 74 75 // GB9a. 76 {grAny, prSpacingMark}: {grAny, grNoBoundary, 91}, 77 78 // GB9b. 79 {grAny, prPreprend}: {grPrepend, grBoundary, 9990}, 80 {grPrepend, prAny}: {grAny, grNoBoundary, 92}, 81 82 // GB11. 83 {grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990}, 84 {grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110}, 85 {grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110}, 86 {grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110}, 87 88 // GB12 / GB13. 89 {grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990}, 90 {grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120}, 91 {grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120}, 92} 93 94// Graphemes implements an iterator over Unicode extended grapheme clusters, 95// specified in the Unicode Standard Annex #29. Grapheme clusters correspond to 96// "user-perceived characters". These characters often consist of multiple 97// code points (e.g. the "woman kissing woman" emoji consists of 8 code points: 98// woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ + 99// woman) and the rules described in Annex #29 must be applied to group those 100// code points into clusters perceived by the user as one character. 101type Graphemes struct { 102 // The code points over which this class iterates. 103 codePoints []rune 104 105 // The (byte-based) indices of the code points into the original string plus 106 // len(original string). Thus, len(indices) = len(codePoints) + 1. 107 indices []int 108 109 // The current grapheme cluster to be returned. These are indices into 110 // codePoints/indices. If start == end, we either haven't started iterating 111 // yet (0) or the iteration has already completed (1). 112 start, end int 113 114 // The index of the next code point to be parsed. 115 pos int 116 117 // The current state of the code point parser. 118 state int 119} 120 121// NewGraphemes returns a new grapheme cluster iterator. 122func NewGraphemes(s string) *Graphemes { 123 l := utf8.RuneCountInString(s) 124 codePoints := make([]rune, l) 125 indices := make([]int, l+1) 126 i := 0 127 for pos, r := range s { 128 codePoints[i] = r 129 indices[i] = pos 130 i++ 131 } 132 indices[l] = len(s) 133 g := &Graphemes{ 134 codePoints: codePoints, 135 indices: indices, 136 } 137 g.Next() // Parse ahead. 138 return g 139} 140 141// Next advances the iterator by one grapheme cluster and returns false if no 142// clusters are left. This function must be called before the first cluster is 143// accessed. 144func (g *Graphemes) Next() bool { 145 g.start = g.end 146 147 // The state transition gives us a boundary instruction BEFORE the next code 148 // point so we always need to stay ahead by one code point. 149 150 // Parse the next code point. 151 for g.pos <= len(g.codePoints) { 152 // GB2. 153 if g.pos == len(g.codePoints) { 154 g.end = g.pos 155 g.pos++ 156 break 157 } 158 159 // Determine the property of the next character. 160 nextProperty := property(g.codePoints[g.pos]) 161 g.pos++ 162 163 // Find the applicable transition. 164 var boundary bool 165 transition, ok := grTransitions[[2]int{g.state, nextProperty}] 166 if ok { 167 // We have a specific transition. We'll use it. 168 g.state = transition[0] 169 boundary = transition[1] == grBoundary 170 } else { 171 // No specific transition found. Try the less specific ones. 172 transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}] 173 transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}] 174 if okAnyProp && okAnyState { 175 // Both apply. We'll use a mix (see comments for grTransitions). 176 g.state = transAnyState[0] 177 boundary = transAnyState[1] == grBoundary 178 if transAnyProp[2] < transAnyState[2] { 179 g.state = transAnyProp[0] 180 boundary = transAnyProp[1] == grBoundary 181 } 182 } else if okAnyProp { 183 // We only have a specific state. 184 g.state = transAnyProp[0] 185 boundary = transAnyProp[1] == grBoundary 186 // This branch will probably never be reached because okAnyState will 187 // always be true given the current transition map. But we keep it here 188 // for future modifications to the transition map where this may not be 189 // true anymore. 190 } else if okAnyState { 191 // We only have a specific property. 192 g.state = transAnyState[0] 193 boundary = transAnyState[1] == grBoundary 194 } else { 195 // No known transition. GB999: Any x Any. 196 g.state = grAny 197 boundary = true 198 } 199 } 200 201 // If we found a cluster boundary, let's stop here. The current cluster will 202 // be the one that just ended. 203 if g.pos-1 == 0 /* GB1 */ || boundary { 204 g.end = g.pos - 1 205 break 206 } 207 } 208 209 return g.start != g.end 210} 211 212// Runes returns a slice of runes (code points) which corresponds to the current 213// grapheme cluster. If the iterator is already past the end or Next() has not 214// yet been called, nil is returned. 215func (g *Graphemes) Runes() []rune { 216 if g.start == g.end { 217 return nil 218 } 219 return g.codePoints[g.start:g.end] 220} 221 222// Str returns a substring of the original string which corresponds to the 223// current grapheme cluster. If the iterator is already past the end or Next() 224// has not yet been called, an empty string is returned. 225func (g *Graphemes) Str() string { 226 if g.start == g.end { 227 return "" 228 } 229 return string(g.codePoints[g.start:g.end]) 230} 231 232// Bytes returns a byte slice which corresponds to the current grapheme cluster. 233// If the iterator is already past the end or Next() has not yet been called, 234// nil is returned. 235func (g *Graphemes) Bytes() []byte { 236 if g.start == g.end { 237 return nil 238 } 239 return []byte(string(g.codePoints[g.start:g.end])) 240} 241 242// Positions returns the interval of the current grapheme cluster as byte 243// positions into the original string. The first returned value "from" indexes 244// the first byte and the second returned value "to" indexes the first byte that 245// is not included anymore, i.e. str[from:to] is the current grapheme cluster of 246// the original string "str". If Next() has not yet been called, both values are 247// 0. If the iterator is already past the end, both values are 1. 248func (g *Graphemes) Positions() (int, int) { 249 return g.indices[g.start], g.indices[g.end] 250} 251 252// Reset puts the iterator into its initial state such that the next call to 253// Next() sets it to the first grapheme cluster again. 254func (g *Graphemes) Reset() { 255 g.start, g.end, g.pos, g.state = 0, 0, 0, grAny 256 g.Next() // Parse ahead again. 257} 258 259// GraphemeClusterCount returns the number of user-perceived characters 260// (grapheme clusters) for the given string. To calculate this number, it 261// iterates through the string using the Graphemes iterator. 262func GraphemeClusterCount(s string) (n int) { 263 g := NewGraphemes(s) 264 for g.Next() { 265 n++ 266 } 267 return 268} 269