1//  Copyright (c) 2015 Couchbase, Inc.
2//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3//  except in compliance with the License. You may obtain a copy of the License at
4//    http://www.apache.org/licenses/LICENSE-2.0
5//  Unless required by applicable law or agreed to in writing, software distributed under the
6//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7//  either express or implied. See the License for the specific language governing permissions
8//  and limitations under the License.
9
10// +build BUILDTAGS
11
12package segment
13
14import (
15  "fmt"
16  "unicode/utf8"
17)
18
// RagelFlags is a package-level string whose value in this template is the
// literal "RAGELFLAGS".
// NOTE(review): this looks like a placeholder that the build substitutes with
// the flags passed to ragel when generating the Go file — confirm against the
// build script.
19var RagelFlags = "RAGELFLAGS"
20
// ParseError is the error segmentWords returns when the state machine stops
// in a state lower than s_first_final (i.e. not an accepting state).
21var ParseError = fmt.Errorf("unicode word segmentation parse error")
22
23// Word Types: the values segmentWords appends to its types slice, one per
// emitted token, in the same order as the token byte slices.
24const (
25  None = iota // appended by finishNoneToken (regional indicators, CR/LF/newline, extend/format runs, other)
26  Number // appended by finishNumericToken
27  Letter // appended by finishWordToken and finishHangulToken
28  Kana // NOTE(review): not appended by any action in this file; Katakana and Hiragana tokens are tagged Ideo — confirm intended
29  Ideo // appended by finishHanToken, finishKatakanaToken and finishHiraganaToken
30)
31
// Ragel section: names the state machine "s" and asks Ragel to emit the
// machine's static data at this point of the generated Go file. Names such as
// s_first_final, used by segmentWords, come from this generated data.
32%%{
33  machine s;
34  write data;
35}%%
36
// segmentWords runs the Ragel scanner defined in its body over data and, for
// every token it emits, appends the token bytes to val and the matching word
// type constant (None, Number, Letter, Ideo) to types.
//
// Parameters:
//   data      - input bytes. Emitted tokens are sub-slices of data, not copies.
//   maxTokens - when > 0, the function returns as soon as len(val) reaches
//               this value (len(val) includes entries already present in a
//               caller-supplied val). When negative, it only selects the
//               initial capacity (1000) used if val/types are nil.
//   atEOF     - when false, the finish* actions hold back tokens instead of
//               emitting them in the situations coded below; held-back bytes
//               are not counted as consumed.
//               NOTE(review): presumably this lets a streaming caller retry
//               with more data — confirm against the caller.
//   val,types - slices to append to; each is allocated here when nil.
//
// Returns the (possibly grown) val and types, the number of bytes of data
// covered up to and including the last emitted token (0 if none was emitted),
// and ParseError if the machine finishes in a state below s_first_final. The
// slices and byte count are returned populated even when the error is set.
37func segmentWords(data []byte, maxTokens int, atEOF bool, val [][]byte, types []int) ([][]byte, []int, int, error) {
  // cs, p and pe are the current state, position and end-of-input variables
  // that the Ragel-generated init/exec code operates on.
38  cs, p, pe := 0, 0, len(data)
  // NOTE(review): cap shadows the builtin of the same name for the rest of
  // the function, and maxTokens == 0 yields a zero initial capacity.
39  cap := maxTokens
40  if cap < 0 {
41    cap = 1000
42  }
43  if val == nil {
44    val = make([][]byte, 0, cap)
45  }
46  if types == nil {
47    types = make([]int, 0, cap)
48  }
49
50  // added for scanner
  // ts, te and act are the token-start, token-end and action variables that a
  // Ragel scanner section requires the host code to declare.
51  ts := 0
52  te := 0
53  act := 0
54  eof := pe // always the end of data, regardless of atEOF; the actions check atEOF themselves
55  _ = ts // compiler not happy
56  _ = te
57  _ = act
58
59  // our state
  // startPos/endPos delimit the current match (endPos is inclusive);
  // totalConsumed is the byte count reported back to the caller.
60  startPos := 0
61  endPos := 0
62  totalConsumed := 0
63  %%{
64
65  include SCRIPTS "ragel/uscript.rl";
66  include WB "ragel/uwb.rl";
67
68  action startToken {
69    startPos = p // byte offset at which the current pattern started matching
70  }
71
72  action endToken {
73    endPos = p // byte offset of the last byte accepted so far (inclusive end)
74  }
75
76  action finishNumericToken {
    // Unless this is the final chunk, return what has been collected so far
    // without emitting this token or advancing totalConsumed.
77    if !atEOF {
78      return val, types, totalConsumed, nil
79    }
80
    // Emit the match as a Number token and stop early once the token limit
    // is reached.
81    val = append(val, data[startPos:endPos+1])
82    types = append(types, Number)
83    totalConsumed = endPos+1
84    if maxTokens > 0 && len(val) >= maxTokens {
85      return val, types, totalConsumed, nil
86    }
87  }
88
89  action finishHangulToken {
    // Hold the token back when it ends exactly at the end of data and this is
    // not the final chunk, or when the byte after it does not decode as valid
    // UTF-8 (DecodeRune reports RuneError with size 1). At the end of data
    // with atEOF set, the remaining slice is empty, DecodeRune reports size 0,
    // and the token is emitted.
90    if endPos+1 == pe && !atEOF {
91      return val, types, totalConsumed, nil
92    } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
93      return val, types, totalConsumed, nil
94    }
95
    // Emit as a Letter token.
96    val = append(val, data[startPos:endPos+1])
97    types = append(types, Letter)
98    totalConsumed = endPos+1
99    if maxTokens > 0 && len(val) >= maxTokens {
100      return val, types, totalConsumed, nil
101    }
102  }
103
104  action finishKatakanaToken {
    // Same hold-back rule as finishHangulToken.
105    if endPos+1 == pe && !atEOF {
106      return val, types, totalConsumed, nil
107    } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
108      return val, types, totalConsumed, nil
109    }
110
    // NOTE(review): Katakana is tagged Ideo here although a Kana constant
    // exists — confirm this is intended.
111    val = append(val, data[startPos:endPos+1])
112    types = append(types, Ideo)
113    totalConsumed = endPos+1
114    if maxTokens > 0 && len(val) >= maxTokens {
115      return val, types, totalConsumed, nil
116    }
117  }
118
119  action finishWordToken {
    // Unless this is the final chunk, return without emitting this token.
120    if !atEOF {
121      return val, types, totalConsumed, nil
122    }
    // Emit as a Letter token.
123    val = append(val, data[startPos:endPos+1])
124    types = append(types, Letter)
125    totalConsumed = endPos+1
126    if maxTokens > 0 && len(val) >= maxTokens {
127      return val, types, totalConsumed, nil
128    }
129  }
130
131  action finishHanToken {
    // Same hold-back rule as finishHangulToken.
132    if endPos+1 == pe && !atEOF {
133      return val, types, totalConsumed, nil
134    } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
135      return val, types, totalConsumed, nil
136    }
137
    // Emit as an Ideo token.
138    val = append(val, data[startPos:endPos+1])
139    types = append(types, Ideo)
140    totalConsumed = endPos+1
141    if maxTokens > 0 && len(val) >= maxTokens {
142      return val, types, totalConsumed, nil
143    }
144  }
145
146  action finishHiraganaToken {
    // Same hold-back rule as finishHangulToken.
147    if endPos+1 == pe && !atEOF {
148      return val, types, totalConsumed, nil
149    } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
150      return val, types, totalConsumed, nil
151    }
152
    // Emit as an Ideo token (see the review note in finishKatakanaToken).
153    val = append(val, data[startPos:endPos+1])
154    types = append(types, Ideo)
155    totalConsumed = endPos+1
156    if maxTokens > 0 && len(val) >= maxTokens {
157      return val, types, totalConsumed, nil
158    }
159  }
160
161  action finishNoneToken {
    // Step forward from startPos one decoded rune at a time until past the
    // recorded end, then move endPos to the last byte of that final rune, so
    // the token never stops in the middle of a multi-byte sequence.
162    lastPos := startPos
163    for lastPos <= endPos {
164      _, size := utf8.DecodeRune(data[lastPos:])
165      lastPos += size
166    }
167    endPos = lastPos -1
168    p = endPos // reposition the scanner at the adjusted end of the token
169
    // Same hold-back rule as finishHangulToken, applied to the adjusted end.
170    if endPos+1 == pe && !atEOF {
171      return val, types, totalConsumed, nil
172    } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
173      return val, types, totalConsumed, nil
174    }
175    // otherwise, consume this as well
176    val = append(val, data[startPos:endPos+1])
177    types = append(types, None)
178    totalConsumed = endPos+1
179    if maxTokens > 0 && len(val) >= maxTokens {
180      return val, types, totalConsumed, nil
181    }
182  }
183
184  HangulEx = Hangul ( Extend | Format )*;
185  HebrewOrALetterEx = ( Hebrew_Letter | ALetter ) ( Extend | Format )*;
186  NumericEx = Numeric ( Extend | Format )*;
187  KatakanaEx = Katakana ( Extend | Format )*;
188  MidLetterEx = ( MidLetter | MidNumLet | Single_Quote ) ( Extend | Format )*;
189  MidNumericEx = ( MidNum | MidNumLet | Single_Quote ) ( Extend | Format )*;
190  ExtendNumLetEx = ExtendNumLet ( Extend | Format )*;
191  HanEx = Han ( Extend | Format )*;
192  HiraganaEx = Hiragana ( Extend | Format )*;
193  SingleQuoteEx = Single_Quote ( Extend | Format )*;
194  DoubleQuoteEx = Double_Quote ( Extend | Format )*;
195  HebrewLetterEx = Hebrew_Letter ( Extend | Format )*;
196  RegionalIndicatorEx = Regional_Indicator ( Extend | Format )*;
197  NLCRLF = Newline | CR | LF;
198  OtherEx = ^(NLCRLF) ( Extend | Format )* ;
199
200  # UAX#29 WB8.   Numeric × Numeric
201  #        WB11.  Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
202  #       WB12.  Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
203  #       WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
204  #       WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
205  #
206  WordNumeric = ( ( ExtendNumLetEx )* NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )* ( ExtendNumLetEx )* ) >startToken @endToken;
207
208  # subset of the below for typing purposes only!
209  WordHangul = ( HangulEx )+ >startToken @endToken;
210  WordKatakana = ( KatakanaEx )+ >startToken @endToken;
211
212  # UAX#29 WB5.   (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
213  #       WB6.   (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
214  #       WB7.   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
215  #       WB7a.  Hebrew_Letter × Single_Quote
216  #       WB7b.  Hebrew_Letter × Double_Quote Hebrew_Letter
217  #       WB7c.  Hebrew_Letter Double_Quote × Hebrew_Letter
218  #       WB9.   (ALetter | Hebrew_Letter) × Numeric
219  #       WB10.  Numeric × (ALetter | Hebrew_Letter)
220  #       WB13.  Katakana × Katakana
221  #       WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
222  #       WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
223  #
224  # Marty -deviated here to allow for (ExtendNumLetEx x ExtendNumLetEx) part of 13a
225  #
226  Word = ( ( ExtendNumLetEx )* ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
227                             | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
228                               | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
229                               | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
230                               |ExtendNumLetEx
231                               )+
232                             )
233         (
234          ( ExtendNumLetEx )+ ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
235                              | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
236                                | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
237                                | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
238                                )+
239                              )
240         )* ExtendNumLetEx*) >startToken @endToken;
241
242  # UAX#29 WB14.  Any ÷ Any
243  WordHan = HanEx >startToken @endToken;
244  WordHiragana = HiraganaEx >startToken @endToken;
245
246  WordExt = ( ( Extend | Format )* ) >startToken @endToken; # maybe plus not star
247
248  WordCRLF = (CR LF) >startToken @endToken;
249
250  WordCR = CR >startToken @endToken;
251
252  WordLF = LF >startToken @endToken;
253
254  WordNL = Newline >startToken @endToken;
255
256  WordRegional = (RegionalIndicatorEx+) >startToken @endToken;
257
258  Other = OtherEx >startToken @endToken;
259
260  main := |*
261    WordNumeric => finishNumericToken;
262    WordHangul => finishHangulToken;
263    WordKatakana => finishKatakanaToken;
264    Word => finishWordToken;
265    WordHan => finishHanToken;
266    WordHiragana => finishHiraganaToken;
267    WordRegional =>finishNoneToken;
268    WordCRLF => finishNoneToken;
269    WordCR => finishNoneToken;
270    WordLF => finishNoneToken;
271    WordNL => finishNoneToken;
272    WordExt => finishNoneToken;
273    Other => finishNoneToken;
274  *|;
275
276    write init;
277    write exec;
278  }%%
279
  // A final state lower than s_first_final means the machine did not stop in
  // an accepting state; report that while still returning what was collected.
280  if cs < s_first_final {
281    return val, types, totalConsumed, ParseError
282  }
283
284  return val, types, totalConsumed, nil
285}
286