//  Copyright (c) 2015 Couchbase, Inc.
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
//  except in compliance with the License. You may obtain a copy of the License at
//    http://www.apache.org/licenses/LICENSE-2.0
//  Unless required by applicable law or agreed to in writing, software distributed under the
//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
//  either express or implied. See the License for the specific language governing permissions
//  and limitations under the License.

// +build BUILDTAGS

package segment

import (
	"fmt"
	"unicode/utf8"
)

// RagelFlags holds an all-caps placeholder string in this template.
// NOTE(review): presumably the build tooling substitutes the actual Ragel
// command-line flags into the generated Go file (same for the build tag
// placeholder above) - confirm against the generate commands.
var RagelFlags = "RAGELFLAGS"

// ParseError is returned by segmentWords when the scanner stops in a
// non-final state.
var ParseError = fmt.Errorf("unicode word segmentation parse error")

// Word Types
//
// These values are appended to the types slice returned by segmentWords, one
// per emitted token.
const (
	// None is used for tokens matched by the regional-indicator, CR/LF,
	// newline, Extend/Format and catch-all rules.
	None = iota
	// Number is used for tokens matched by the WordNumeric rule.
	Number
	// Letter is used for tokens matched by the Word and WordHangul rules.
	Letter
	// Kana is declared but never emitted by the actions in this file.
	// NOTE(review): Katakana and Hiragana tokens are typed Ideo below -
	// confirm whether that is intentional before changing it.
	Kana
	// Ideo is used for tokens matched by the WordHan, WordHiragana and
	// WordKatakana rules.
	Ideo
)

%%{
  machine s;
  write data;
}%%

// segmentWords runs the Ragel scanner for machine s (word-boundary rules
// modelled on UAX#29) over data. For every token it emits, it appends the
// token bytes to val and the matching word type constant to types.
//
// Parameters:
//   - data: the bytes to segment. Emitted tokens are sub-slices of data, not
//     copies.
//   - maxTokens: when greater than zero, the function returns as soon as
//     len(val) reaches maxTokens after an append; entries already present in
//     a caller-supplied val count toward that limit. Zero or a negative
//     value means no limit.
//   - atEOF: whether data ends the input. Number and Word tokens are emitted
//     only when atEOF is true. Every other token kind is held back when it
//     ends exactly at the end of data and atEOF is false.
//   - val, types: slices to append to. A nil slice is allocated here.
//
// Independently of atEOF, the non-Number/non-Word actions also stop and
// return when the bytes following the token do not decode as a valid UTF-8
// rune (utf8.DecodeRune reports RuneError with size 1).
//
// Returns the updated val and types, the offset just past the last emitted
// token (0 when nothing was emitted), and ParseError when the machine ends in
// a non-final state. On ParseError the tokens collected so far are still
// returned.
func segmentWords(data []byte, maxTokens int, atEOF bool, val [][]byte, types []int) ([][]byte, []int, int, error) {
	cs, p, pe := 0, 0, len(data)

	// Initial capacity for result slices allocated here. Any non-positive
	// maxTokens means no limit (the finish actions only enforce the limit
	// when maxTokens > 0), so use a fixed default in that case. The local is
	// deliberately not named cap, which would shadow the builtin.
	capacity := maxTokens
	if capacity <= 0 {
		capacity = 1000
	}
	if val == nil {
		val = make([][]byte, 0, capacity)
	}
	if types == nil {
		types = make([]int, 0, capacity)
	}

	// added for scanner
	// ts, te and act are the token-start, token-end and last-matched-rule
	// variables that Ragel scanners (the |* ... *| construct below) expect
	// the host program to declare.
	ts := 0
	te := 0
	act := 0
	eof := pe
	_ = ts // compiler not happy
	_ = te
	_ = act

	// our state
	// startPos/endPos are the inclusive byte bounds of the current match as
	// recorded by the startToken/endToken actions; totalConsumed is the
	// offset just past the last token appended to val.
	startPos := 0
	endPos := 0
	totalConsumed := 0
	%%{

  include SCRIPTS "ragel/uscript.rl";
  include WB "ragel/uwb.rl";

  # Entering action (attached with >): remember where the match begins.
  action startToken {
    startPos = p
  }

  # Finishing action (attached with @): remember the index of the last byte
  # seen while the pattern is in a final state. The token is therefore
  # data[startPos:endPos+1].
  action endToken {
    endPos = p
  }

  # Emits a Number token. A return inside any finish action leaves
  # segmentWords entirely.
  action finishNumericToken {
    // Number tokens are only emitted once the caller reports end of input.
    if !atEOF {
      return val, types, totalConsumed, nil
    }

    val = append(val, data[startPos:endPos+1])
    types = append(types, Number)
    totalConsumed = endPos+1
    if maxTokens > 0 && len(val) >= maxTokens {
      return val, types, totalConsumed, nil
    }
  }

  # Emits a Hangul run as a Letter token.
  action finishHangulToken {
    // Hold the token back when it touches the end of a non-final buffer, or
    // when the bytes after it do not decode as a valid UTF-8 rune.
    if endPos+1 == pe && !atEOF {
      return val, types, totalConsumed, nil
    } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
      return val, types, totalConsumed, nil
    }

    val = append(val, data[startPos:endPos+1])
    types = append(types, Letter)
    totalConsumed = endPos+1
    if maxTokens > 0 && len(val) >= maxTokens {
      return val, types, totalConsumed, nil
    }
  }

  # Emits a Katakana run. NOTE(review): typed Ideo, not Kana - confirm intent.
  action finishKatakanaToken {
    // Same hold-back conditions as finishHangulToken.
    if endPos+1 == pe && !atEOF {
      return val, types, totalConsumed, nil
    } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
      return val, types, totalConsumed, nil
    }

    val = append(val, data[startPos:endPos+1])
    types = append(types, Ideo)
    totalConsumed = endPos+1
    if maxTokens > 0 && len(val) >= maxTokens {
      return val, types, totalConsumed, nil
    }
  }

  # Emits a general Word match as a Letter token.
  action finishWordToken {
    // Word tokens are only emitted once the caller reports end of input.
    if !atEOF {
      return val, types, totalConsumed, nil
    }
    val = append(val, data[startPos:endPos+1])
    types = append(types, Letter)
    totalConsumed = endPos+1
    if maxTokens > 0 && len(val) >= maxTokens {
      return val, types, totalConsumed, nil
    }
  }

  # Emits a single Han character (plus trailing Extend/Format) as Ideo.
  action finishHanToken {
    // Same hold-back conditions as finishHangulToken.
    if endPos+1 == pe && !atEOF {
      return val, types, totalConsumed, nil
    } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
      return val, types, totalConsumed, nil
    }

    val = append(val, data[startPos:endPos+1])
    types = append(types, Ideo)
    totalConsumed = endPos+1
    if maxTokens > 0 && len(val) >= maxTokens {
      return val, types, totalConsumed, nil
    }
  }

  # Emits a single Hiragana character (plus trailing Extend/Format) as Ideo.
  # NOTE(review): typed Ideo, not Kana - confirm intent.
  action finishHiraganaToken {
    // Same hold-back conditions as finishHangulToken.
    if endPos+1 == pe && !atEOF {
      return val, types, totalConsumed, nil
    } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
      return val, types, totalConsumed, nil
    }

    val = append(val, data[startPos:endPos+1])
    types = append(types, Ideo)
    totalConsumed = endPos+1
    if maxTokens > 0 && len(val) >= maxTokens {
      return val, types, totalConsumed, nil
    }
  }

  # Emits everything that is not a number, word or ideograph as a None token.
  action finishNoneToken {
    // Step rune by rune from startPos until past endPos, so that endPos ends
    // up on the final byte of the rune that contains it.
    lastPos := startPos
    for lastPos <= endPos {
      _, size := utf8.DecodeRune(data[lastPos:])
      lastPos += size
    }
    endPos = lastPos -1
    // Move the scanner position to the adjusted end of the token.
    p = endPos

    // Same hold-back conditions as finishHangulToken.
    if endPos+1 == pe && !atEOF {
      return val, types, totalConsumed, nil
    } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
      return val, types, totalConsumed, nil
    }
    // otherwise, consume this as well
    val = append(val, data[startPos:endPos+1])
    types = append(types, None)
    totalConsumed = endPos+1
    if maxTokens > 0 && len(val) >= maxTokens {
      return val, types, totalConsumed, nil
    }
  }

  # Each *Ex definition is a base class followed by any number of Extend or
  # Format characters. The base classes are not defined in this file;
  # presumably they come from the two includes above.
  HangulEx = Hangul ( Extend | Format )*;
  HebrewOrALetterEx = ( Hebrew_Letter | ALetter ) ( Extend | Format )*;
  NumericEx = Numeric ( Extend | Format )*;
  KatakanaEx = Katakana ( Extend | Format )*;
  MidLetterEx = ( MidLetter | MidNumLet | Single_Quote ) ( Extend | Format )*;
  MidNumericEx = ( MidNum | MidNumLet | Single_Quote ) ( Extend | Format )*;
  ExtendNumLetEx = ExtendNumLet ( Extend | Format )*;
  HanEx = Han ( Extend | Format )*;
  HiraganaEx = Hiragana ( Extend | Format )*;
  SingleQuoteEx = Single_Quote ( Extend | Format )*;
  DoubleQuoteEx = Double_Quote ( Extend | Format )*;
  HebrewLetterEx = Hebrew_Letter ( Extend | Format )*;
  RegionalIndicatorEx = Regional_Indicator ( Extend | Format )*;
  NLCRLF = Newline | CR | LF;
  OtherEx = ^(NLCRLF) ( Extend | Format )* ;

  # UAX#29 WB8.   Numeric × Numeric
  #        WB11.  Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
  #        WB12.  Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
  #        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
  #        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
  #
  WordNumeric = ( ( ExtendNumLetEx )* NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )* ( ExtendNumLetEx )* ) >startToken @endToken;

  # subset of the below for typing purposes only!
  WordHangul = ( HangulEx )+ >startToken @endToken;
  WordKatakana = ( KatakanaEx )+ >startToken @endToken;

  # UAX#29 WB5.   (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
  #        WB6.   (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
  #        WB7.   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
  #        WB7a.  Hebrew_Letter × Single_Quote
  #        WB7b.  Hebrew_Letter × Double_Quote Hebrew_Letter
  #        WB7c.  Hebrew_Letter Double_Quote × Hebrew_Letter
  #        WB9.   (ALetter | Hebrew_Letter) × Numeric
  #        WB10.  Numeric × (ALetter | Hebrew_Letter)
  #        WB13.  Katakana × Katakana
  #        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
  #        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
  #
  # Marty -deviated here to allow for (ExtendNumLetEx x ExtendNumLetEx) part of 13a
  #
  Word = ( ( ExtendNumLetEx )* ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
                               | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
                                 | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
                                 | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
                                 |ExtendNumLetEx
                                 )+
                               )
         (
          ( ExtendNumLetEx )+ ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
                              | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
                                | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
                                | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
                                )+
                              )
         )* ExtendNumLetEx*) >startToken @endToken;

  # UAX#29 WB14.  Any ÷ Any
  WordHan = HanEx >startToken @endToken;
  WordHiragana = HiraganaEx >startToken @endToken;

  WordExt = ( ( Extend | Format )* ) >startToken @endToken; # maybe plus not star

  WordCRLF = (CR LF) >startToken @endToken;

  WordCR = CR >startToken @endToken;

  WordLF = LF >startToken @endToken;

  WordNL = Newline >startToken @endToken;

  WordRegional = (RegionalIndicatorEx+) >startToken @endToken;

  Other = OtherEx >startToken @endToken;

  # Scanner: each alternative is paired with the finish action that emits its
  # token. The order of the alternatives is part of the behaviour and must
  # not be changed casually.
  main := |*
    WordNumeric => finishNumericToken;
    WordHangul => finishHangulToken;
    WordKatakana => finishKatakanaToken;
    Word => finishWordToken;
    WordHan => finishHanToken;
    WordHiragana => finishHiraganaToken;
    WordRegional =>finishNoneToken;
    WordCRLF => finishNoneToken;
    WordCR => finishNoneToken;
    WordLF => finishNoneToken;
    WordNL => finishNoneToken;
    WordExt => finishNoneToken;
    Other => finishNoneToken;
  *|;

  write init;
  write exec;
  }%%

	// The machine stopped outside a final state: report a parse error, but
	// still hand back whatever was collected so far.
	if cs < s_first_final {
		return val, types, totalConsumed, ParseError
	}

	return val, types, totalConsumed, nil
}