1// Copyright 2015 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package colltab 6 7// An Iter incrementally converts chunks of the input text to collation 8// elements, while ensuring that the collation elements are in normalized order 9// (that is, they are in the order as if the input text were normalized first). 10type Iter struct { 11 Weighter Weighter 12 Elems []Elem 13 // N is the number of elements in Elems that will not be reordered on 14 // subsequent iterations, N <= len(Elems). 15 N int 16 17 bytes []byte 18 str string 19 // Because the Elems buffer may contain collation elements that are needed 20 // for look-ahead, we need two positions in the text (bytes or str): one for 21 // the end position in the text for the current iteration and one for the 22 // start of the next call to appendNext. 23 pEnd int // end position in text corresponding to N. 24 pNext int // pEnd <= pNext. 25} 26 27// Reset sets the position in the current input text to p and discards any 28// results obtained so far. 29func (i *Iter) Reset(p int) { 30 i.Elems = i.Elems[:0] 31 i.N = 0 32 i.pEnd = p 33 i.pNext = p 34} 35 36// Len returns the length of the input text. 37func (i *Iter) Len() int { 38 if i.bytes != nil { 39 return len(i.bytes) 40 } 41 return len(i.str) 42} 43 44// Discard removes the collation elements up to N. 45func (i *Iter) Discard() { 46 // TODO: change this such that only modifiers following starters will have 47 // to be copied. 48 i.Elems = i.Elems[:copy(i.Elems, i.Elems[i.N:])] 49 i.N = 0 50} 51 52// End returns the end position of the input text for which Next has returned 53// results. 54func (i *Iter) End() int { 55 return i.pEnd 56} 57 58// SetInput resets i to input s. 59func (i *Iter) SetInput(s []byte) { 60 i.bytes = s 61 i.str = "" 62 i.Reset(0) 63} 64 65// SetInputString resets i to input s. 66func (i *Iter) SetInputString(s string) { 67 i.str = s 68 i.bytes = nil 69 i.Reset(0) 70} 71 72func (i *Iter) done() bool { 73 return i.pNext >= len(i.str) && i.pNext >= len(i.bytes) 74} 75 76func (i *Iter) appendNext() bool { 77 if i.done() { 78 return false 79 } 80 var sz int 81 if i.bytes == nil { 82 i.Elems, sz = i.Weighter.AppendNextString(i.Elems, i.str[i.pNext:]) 83 } else { 84 i.Elems, sz = i.Weighter.AppendNext(i.Elems, i.bytes[i.pNext:]) 85 } 86 if sz == 0 { 87 sz = 1 88 } 89 i.pNext += sz 90 return true 91} 92 93// Next appends Elems to the internal array. On each iteration, it will either 94// add starters or modifiers. In the majority of cases, an Elem with a primary 95// value > 0 will have a CCC of 0. The CCC values of collation elements are also 96// used to detect if the input string was not normalized and to adjust the 97// result accordingly. 98func (i *Iter) Next() bool { 99 if i.N == len(i.Elems) && !i.appendNext() { 100 return false 101 } 102 103 // Check if the current segment starts with a starter. 104 prevCCC := i.Elems[len(i.Elems)-1].CCC() 105 if prevCCC == 0 { 106 i.N = len(i.Elems) 107 i.pEnd = i.pNext 108 return true 109 } else if i.Elems[i.N].CCC() == 0 { 110 // set i.N to only cover part of i.Elems for which prevCCC == 0 and 111 // use rest for the next call to next. 112 for i.N++; i.N < len(i.Elems) && i.Elems[i.N].CCC() == 0; i.N++ { 113 } 114 i.pEnd = i.pNext 115 return true 116 } 117 118 // The current (partial) segment starts with modifiers. We need to collect 119 // all successive modifiers to ensure that they are normalized. 120 for { 121 p := len(i.Elems) 122 i.pEnd = i.pNext 123 if !i.appendNext() { 124 break 125 } 126 127 if ccc := i.Elems[p].CCC(); ccc == 0 || len(i.Elems)-i.N > maxCombiningCharacters { 128 // Leave the starter for the next iteration. This ensures that we 129 // do not return sequences of collation elements that cross two 130 // segments. 131 // 132 // TODO: handle large number of combining characters by fully 133 // normalizing the input segment before iteration. This ensures 134 // results are consistent across the text repo. 135 i.N = p 136 return true 137 } else if ccc < prevCCC { 138 i.doNorm(p, ccc) // should be rare, never occurs for NFD and FCC. 139 } else { 140 prevCCC = ccc 141 } 142 } 143 144 done := len(i.Elems) != i.N 145 i.N = len(i.Elems) 146 return done 147} 148 149// nextNoNorm is the same as next, but does not "normalize" the collation 150// elements. 151func (i *Iter) nextNoNorm() bool { 152 // TODO: remove this function. Using this instead of next does not seem 153 // to improve performance in any significant way. We retain this until 154 // later for evaluation purposes. 155 if i.done() { 156 return false 157 } 158 i.appendNext() 159 i.N = len(i.Elems) 160 return true 161} 162 163const maxCombiningCharacters = 30 164 165// doNorm reorders the collation elements in i.Elems. 166// It assumes that blocks of collation elements added with appendNext 167// either start and end with the same CCC or start with CCC == 0. 168// This allows for a single insertion point for the entire block. 169// The correctness of this assumption is verified in builder.go. 170func (i *Iter) doNorm(p int, ccc uint8) { 171 n := len(i.Elems) 172 k := p 173 for p--; p > i.N && ccc < i.Elems[p-1].CCC(); p-- { 174 } 175 i.Elems = append(i.Elems, i.Elems[p:k]...) 176 copy(i.Elems[p:], i.Elems[k:]) 177 i.Elems = i.Elems[:n] 178} 179