1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 //
16 // Author: dsites@google.com (Dick Sites)
17 //
18 // Just the stuff shared between offline table builder and online detector
19 //
20
21 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
22 #define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
23
24 #include "integral_types.h"
25 #include "cld2tablesummary.h"
26
27 namespace CLD2 {
28
29 // Runtime routines for hashing, looking up, and scoring
30 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
31 // Unigrams and bigrams are for CJK languages only, including simplified/
32 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
33 // Zhuang Han characters. Surrounding spaces are not considered.
// Quadgrams and octagrams are for non-CJK and include two bits indicating
35 // preceding and trailing spaces (word boundaries).
36
37
38 //----------------------------------------------------------------------------//
39 // Main quantized probability table //
40 //----------------------------------------------------------------------------//
41
42 // Table has 240 eight-byte entries. Each entry has a five-byte array and
43 // a three-byte array of log base 2 probabilities in the range 1..12.
44 // The intended use is to express five or three probabilities in a single-byte
45 // subscript, then decode via this table. These probabilities are
46 // intended to go with an array of five or three language numbers.
47 //
48 // The corresponding language numbers will have to be sorted by descending
49 // probability, then the actual probability subscript chosen to match the
50 // closest available entry in this table.
51 //
52 // Pattern of probability values:
53 // hi 3/4 1/2 1/4 lo hi mid lo
54 // where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4
55 // and mid is one of 3/4 1/2 or 1/4.
56 // There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and
57 // lo running 1..hi. Only the first group is used for five-entry lookups.
58 // The mid value in the first group is 1/2, the second group 3/4, and the
59 // third group 1/4. For three-entry lookups, this allows the mid entry to be
60 // somewhat higher or lower than the midpoint, to allow a better match to the
61 // original probabilities.
62 static const int kLgProbV2TblSize = 240;
63 static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
64 1,1,1,1,1, 1,1,1, // [0]
65 2,2,2,1,1, 2,2,1, // [1]
66 2,2,2,2,2, 2,2,2,
67 3,3,2,2,1, 3,2,1, // [3]
68 3,3,3,2,2, 3,3,2,
69 3,3,3,3,3, 3,3,3,
70 4,3,3,2,1, 4,3,1, // [6]
71 4,4,3,3,2, 4,3,2,
72 4,4,4,3,3, 4,4,3,
73 4,4,4,4,4, 4,4,4,
74 5,4,3,2,1, 5,3,1, // [10]
75 5,4,4,3,2, 5,4,2,
76 5,5,4,4,3, 5,4,3,
77 5,5,5,4,4, 5,5,4,
78 5,5,5,5,5, 5,5,5,
79 6,5,4,2,1, 6,4,1, // [15]
80 6,5,4,3,2, 6,4,2,
81 6,5,5,4,3, 6,5,3,
82 6,6,5,5,4, 6,5,4,
83 6,6,6,5,5, 6,6,5,
84 6,6,6,6,6, 6,6,6,
85 7,6,4,3,1, 7,4,1, // [21]
86 7,6,5,3,2, 7,5,2,
87 7,6,5,4,3, 7,5,3,
88 7,6,6,5,4, 7,6,4,
89 7,7,6,6,5, 7,6,5,
90 7,7,7,6,6, 7,7,6,
91 7,7,7,7,7, 7,7,7,
92 8,6,5,3,1, 8,5,1, // [28]
93 8,7,5,4,2, 8,5,2,
94 8,7,6,4,3, 8,6,3,
95 8,7,6,5,4, 8,6,4,
96 8,7,7,6,5, 8,7,5,
97 8,8,7,7,6, 8,7,6,
98 8,8,8,7,7, 8,8,7,
99 8,8,8,8,8, 8,8,8,
100 9,7,5,3,1, 9,5,1, // [36]
101 9,7,6,4,2, 9,6,2,
102 9,8,6,5,3, 9,6,3,
103 9,8,7,5,4, 9,7,4,
104 9,8,7,6,5, 9,7,5,
105 9,8,8,7,6, 9,8,6,
106 9,9,8,8,7, 9,8,7,
107 9,9,9,8,8, 9,9,8,
108 9,9,9,9,9, 9,9,9,
109 10,8,6,3,1, 10,6,1, // [45]
110 10,8,6,4,2, 10,6,2,
111 10,8,7,5,3, 10,7,3,
112 10,9,7,6,4, 10,7,4,
113 10,9,8,6,5, 10,8,5,
114 10,9,8,7,6, 10,8,6,
115 10,9,9,8,7, 10,9,7,
116 10,10,9,9,8, 10,9,8,
117 10,10,10,9,9, 10,10,9,
118 10,10,10,10,10, 10,10,10,
119 11,9,6,4,1, 11,6,1, // [55]
120 11,9,7,4,2, 11,7,2,
121 11,9,7,5,3, 11,7,3,
122 11,9,8,6,4, 11,8,4,
123 11,10,8,7,5, 11,8,5,
124 11,10,9,7,6, 11,9,6,
125 11,10,9,8,7, 11,9,7,
126 11,10,10,9,8, 11,10,8,
127 11,11,10,10,9, 11,10,9,
128 11,11,11,10,10, 11,11,10,
129 11,11,11,11,11, 11,11,11,
130 12,9,7,4,1, 12,7,1, // [66]
131 12,10,7,5,2, 12,7,2,
132 12,10,8,5,3, 12,8,3,
133 12,10,8,6,4, 12,8,4,
134 12,10,9,7,5, 12,9,5,
135 12,11,9,8,6, 12,9,6,
136 12,11,10,8,7, 12,10,7,
137 12,11,10,9,8, 12,10,8,
138 12,11,11,10,9, 12,11,9,
139 12,12,11,11,10, 12,11,10,
140 12,12,12,11,11, 12,12,11,
141 12,12,12,12,12, 12,12,12,
142
143 1,1,1,1,1, 1,1,1,
144 2,2,2,1,1, 2,2,1,
145 2,2,2,2,2, 2,2,2,
146 3,3,2,2,1, 3,3,1,
147 3,3,3,2,2, 3,3,2,
148 3,3,3,3,3, 3,3,3,
149 4,3,3,2,1, 4,3,1,
150 4,4,3,3,2, 4,4,2,
151 4,4,4,3,3, 4,4,3,
152 4,4,4,4,4, 4,4,4,
153 5,4,3,2,1, 5,4,1,
154 5,4,4,3,2, 5,4,2,
155 5,5,4,4,3, 5,5,3,
156 5,5,5,4,4, 5,5,4,
157 5,5,5,5,5, 5,5,5,
158 6,5,4,2,1, 6,5,1,
159 6,5,4,3,2, 6,5,2,
160 6,5,5,4,3, 6,5,3,
161 6,6,5,5,4, 6,6,4,
162 6,6,6,5,5, 6,6,5,
163 6,6,6,6,6, 6,6,6,
164 7,6,4,3,1, 7,6,1,
165 7,6,5,3,2, 7,6,2,
166 7,6,5,4,3, 7,6,3,
167 7,6,6,5,4, 7,6,4,
168 7,7,6,6,5, 7,7,5,
169 7,7,7,6,6, 7,7,6,
170 7,7,7,7,7, 7,7,7,
171 8,6,5,3,1, 8,6,1,
172 8,7,5,4,2, 8,7,2,
173 8,7,6,4,3, 8,7,3,
174 8,7,6,5,4, 8,7,4,
175 8,7,7,6,5, 8,7,5,
176 8,8,7,7,6, 8,8,6,
177 8,8,8,7,7, 8,8,7,
178 8,8,8,8,8, 8,8,8,
179 9,7,5,3,1, 9,7,1,
180 9,7,6,4,2, 9,7,2,
181 9,8,6,5,3, 9,8,3,
182 9,8,7,5,4, 9,8,4,
183 9,8,7,6,5, 9,8,5,
184 9,8,8,7,6, 9,8,6,
185 9,9,8,8,7, 9,9,7,
186 9,9,9,8,8, 9,9,8,
187 9,9,9,9,9, 9,9,9,
188 10,8,6,3,1, 10,8,1,
189 10,8,6,4,2, 10,8,2,
190 10,8,7,5,3, 10,8,3,
191 10,9,7,6,4, 10,9,4,
192 10,9,8,6,5, 10,9,5,
193 10,9,8,7,6, 10,9,6,
194 10,9,9,8,7, 10,9,7,
195 10,10,9,9,8, 10,10,8,
196 10,10,10,9,9, 10,10,9,
197 10,10,10,10,10, 10,10,10,
198 11,9,6,4,1, 11,9,1,
199 11,9,7,4,2, 11,9,2,
200 11,9,7,5,3, 11,9,3,
201 11,9,8,6,4, 11,9,4,
202 11,10,8,7,5, 11,10,5,
203 11,10,9,7,6, 11,10,6,
204 11,10,9,8,7, 11,10,7,
205 11,10,10,9,8, 11,10,8,
206 11,11,10,10,9, 11,11,9,
207 11,11,11,10,10, 11,11,10,
208 11,11,11,11,11, 11,11,11,
209 12,9,7,4,1, 12,9,1,
210 12,10,7,5,2, 12,10,2,
211 12,10,8,5,3, 12,10,3,
212 12,10,8,6,4, 12,10,4,
213 12,10,9,7,5, 12,10,5,
214 12,11,9,8,6, 12,11,6,
215 12,11,10,8,7, 12,11,7,
216 12,11,10,9,8, 12,11,8,
217 12,11,11,10,9, 12,11,9,
218 12,12,11,11,10, 12,12,10,
219 12,12,12,11,11, 12,12,11,
220 12,12,12,12,12, 12,12,12,
221
222 1,1,1,1,1, 1,1,1,
223 2,2,2,1,1, 2,1,1,
224 2,2,2,2,2, 2,2,2,
225 3,3,2,2,1, 3,2,1,
226 3,3,3,2,2, 3,2,2,
227 3,3,3,3,3, 3,3,3,
228 4,3,3,2,1, 4,2,1,
229 4,4,3,3,2, 4,3,2,
230 4,4,4,3,3, 4,3,3,
231 4,4,4,4,4, 4,4,4,
232 5,4,3,2,1, 5,2,1,
233 5,4,4,3,2, 5,3,2,
234 5,5,4,4,3, 5,4,3,
235 5,5,5,4,4, 5,4,4,
236 5,5,5,5,5, 5,5,5,
237 6,5,4,2,1, 6,2,1,
238 6,5,4,3,2, 6,3,2,
239 6,5,5,4,3, 6,4,3,
240 6,6,5,5,4, 6,5,4,
241 6,6,6,5,5, 6,5,5,
242 6,6,6,6,6, 6,6,6,
243 7,6,4,3,1, 7,3,1,
244 7,6,5,3,2, 7,3,2,
245 7,6,5,4,3, 7,4,3,
246 7,6,6,5,4, 7,5,4,
247 7,7,6,6,5, 7,6,5,
248 7,7,7,6,6, 7,6,6,
249 7,7,7,7,7, 7,7,7,
250 8,6,5,3,1, 8,3,1,
251 8,7,5,4,2, 8,4,2,
252 8,7,6,4,3, 8,4,3,
253 8,7,6,5,4, 8,5,4,
254 8,7,7,6,5, 8,6,5,
255 8,8,7,7,6, 8,7,6,
256 8,8,8,7,7, 8,7,7,
257 8,8,8,8,8, 8,8,8,
258 9,7,5,3,1, 9,3,1,
259 9,7,6,4,2, 9,4,2,
260 9,8,6,5,3, 9,5,3,
261 9,8,7,5,4, 9,5,4,
262 9,8,7,6,5, 9,6,5,
263 9,8,8,7,6, 9,7,6,
264 9,9,8,8,7, 9,8,7,
265 9,9,9,8,8, 9,8,8,
266 9,9,9,9,9, 9,9,9,
267 10,8,6,3,1, 10,3,1,
268 10,8,6,4,2, 10,4,2,
269 10,8,7,5,3, 10,5,3,
270 10,9,7,6,4, 10,6,4,
271 10,9,8,6,5, 10,6,5,
272 10,9,8,7,6, 10,7,6,
273 10,9,9,8,7, 10,8,7,
274 10,10,9,9,8, 10,9,8,
275 10,10,10,9,9, 10,9,9,
276 10,10,10,10,10, 10,10,10,
277 11,9,6,4,1, 11,4,1,
278 11,9,7,4,2, 11,4,2,
279 11,9,7,5,3, 11,5,3,
280 11,9,8,6,4, 11,6,4,
281 11,10,8,7,5, 11,7,5,
282 11,10,9,7,6, 11,7,6,
283 11,10,9,8,7, 11,8,7,
284 11,10,10,9,8, 11,9,8,
285 11,11,10,10,9, 11,10,9,
286 11,11,11,10,10, 11,10,10,
287 11,11,11,11,11, 11,11,11,
288 12,9,7,4,1, 12,4,1,
289 12,10,7,5,2, 12,5,2,
290 12,10,8,5,3, 12,5,3,
291 12,10,8,6,4, 12,6,4,
292 12,10,9,7,5, 12,7,5,
293 12,11,9,8,6, 12,8,6,
294 12,11,10,8,7, 12,8,7,
295 12,11,10,9,8, 12,9,8,
296 12,11,11,10,9, 12,10,9,
297 12,12,11,11,10, 12,11,10,
298 12,12,12,11,11, 12,11,11,
299 12,12,12,12,12, 12,12,12,
300
301 // Added 2013.01.28 for CJK compatible mapping
302 8,5,2,2,2, 8,2,2,
303 6,6,6,4,2, 6,6,2,
304 6,5,4,4,4, 6,4,4,
305 6,4,2,2,2, 6,2,2,
306 4,3,2,2,2, 4,2,2,
307 2,2,2,2,2, 2,2,2,
308 };
309
310 // Backmap a single desired probability into an entry in kLgProbV2Tbl
311 static const uint8 kLgProbV2TblBackmap[13] = {
312 0,
313 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66,
314 };
315
316 // Return address of 8-byte entry[i]
LgProb2TblEntry(int i)317 inline const uint8* LgProb2TblEntry(int i) {
318 return &kLgProbV2Tbl[i * 8];
319 }
320
321 // Return one of three probabilities in an entry
LgProb3(const uint8 * entry,int j)322 inline uint8 LgProb3(const uint8* entry, int j) {
323 return entry[j + 5];
324 }
325
326
327 // Routines to access a hash table of <key:wordhash, value:probs> pairs
328 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
329 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
330 // bucket subscript.
// Probs is packed: three languages plus a subscript into the probability table.
// Buckets have all the keys together, then all the values. The key array never
// crosses a cache-line boundary, so the no-match case takes exactly one cache miss.
334 // Match case may sometimes take an additional cache miss on value access.
335 //
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
337 // byte buckets with single cache miss.
338 // Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
339
340
341 //----------------------------------------------------------------------------//
342 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
343 //----------------------------------------------------------------------------//
344
345 // BIGRAM
346 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
347 // OVERSHOOTS up to 3 bytes
348 // For runtime use of tables
// Does X86 unaligned loads via UNALIGNED_LOAD32(_p) if !defined(NEED_ALIGNED_LOADS)
350 uint32 BiHashV2(const char* word_ptr, int bytecount);
351
352 // QUADGRAM wrapper with surrounding spaces
353 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
354 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
355 // For runtime use of tables
356 uint32 QuadHashV2(const char* word_ptr, int bytecount);
357
358 // QUADGRAM wrapper with surrounding underscores (offline use)
359 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
360 // OVERSHOOTS up to 3 bytes
361 // For offline construction of tables
362 uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount);
363
364 // OCTAGRAM wrapper with surrounding spaces
365 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
366 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
367 uint64 OctaHash40(const char* word_ptr, int bytecount);
368
369
370 // OCTAGRAM wrapper with surrounding underscores (offline use)
// Pick up 1..24 bytes plus pre/post '_' and hash them via mask/shift/add
372 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
373 uint64 OctaHash40underscore(const char* word_ptr, int bytecount);
374
375 // Hash a consecutive pair of tokens/words A B
376 uint64 PairHash(uint64 worda_hash, uint64 wordb_hash);
377
378
379 // From 32-bit gram FP, return hash table subscript and remaining key
QuadFPJustHash(uint32 quadhash,uint32 keymask,int bucketcount,uint32 * subscr,uint32 * hashkey)380 inline void QuadFPJustHash(uint32 quadhash,
381 uint32 keymask,
382 int bucketcount,
383 uint32* subscr, uint32* hashkey) {
384 *subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1);
385 *hashkey = quadhash & keymask;
386 }
387
388 // From 40-bit gram FP, return hash table subscript and remaining key
OctaFPJustHash(uint64 longwordhash,uint32 keymask,int bucketcount,uint32 * subscr,uint32 * hashkey)389 inline void OctaFPJustHash(uint64 longwordhash,
390 uint32 keymask,
391 int bucketcount,
392 uint32* subscr, uint32* hashkey) {
393 uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1);
394 *subscr = temp;
395 temp = longwordhash >> 4;
396 *hashkey = temp & keymask;
397 }
398
399
400 // Look up 32-bit gram FP in caller-passed table
401 // Typical size 256K entries (1.5MB)
402 // Two-byte hashkey
QuadHashV3Lookup4(const CLD2TableSummary * gram_obj,uint32 quadhash)403 inline const uint32 QuadHashV3Lookup4(const CLD2TableSummary* gram_obj,
404 uint32 quadhash) {
405 uint32 subscr, hashkey;
406 const IndirectProbBucket4* quadtable = gram_obj->kCLDTable;
407 uint32 keymask = gram_obj->kCLDTableKeyMask;
408 int bucketcount = gram_obj->kCLDTableSize;
409 QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey);
410 const IndirectProbBucket4* bucket_ptr = &quadtable[subscr];
411 // Four-way associative, 4 compares
412 if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
413 return bucket_ptr->keyvalue[0];
414 }
415 if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
416 return bucket_ptr->keyvalue[1];
417 }
418 if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
419 return bucket_ptr->keyvalue[2];
420 }
421 if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
422 return bucket_ptr->keyvalue[3];
423 }
424 return 0;
425 }
426
427 // Look up 40-bit gram FP in caller-passed table
428 // Typical size 256K-4M entries (1-16MB)
429 // 24-12 bit hashkey packed with 8-20 bit indirect lang/probs
430 // keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect
OctaHashV3Lookup4(const CLD2TableSummary * gram_obj,uint64 longwordhash)431 inline const uint32 OctaHashV3Lookup4(const CLD2TableSummary* gram_obj,
432 uint64 longwordhash) {
433 uint32 subscr, hashkey;
434 const IndirectProbBucket4* octatable = gram_obj->kCLDTable;
435 uint32 keymask = gram_obj->kCLDTableKeyMask;
436 int bucketcount = gram_obj->kCLDTableSize;
437 OctaFPJustHash(longwordhash, keymask, bucketcount,
438 &subscr, &hashkey);
439 const IndirectProbBucket4* bucket_ptr = &octatable[subscr];
440 // Four-way associative, 4 compares
441 if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
442 return bucket_ptr->keyvalue[0];
443 }
444 if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
445 return bucket_ptr->keyvalue[1];
446 }
447 if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
448 return bucket_ptr->keyvalue[2];
449 }
450 if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
451 return bucket_ptr->keyvalue[3];
452 }
453 return 0;
454 }
455
456
457 //----------------------------------------------------------------------------//
458 // Finding groups of 1/2/4/8 letters //
459 //----------------------------------------------------------------------------//
460
461 // Does not advance past space or tab/cr/lf/nul
462 static const uint8 kAdvanceOneCharButSpace[256] = {
463 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
464 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
465 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
466 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
467
468 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
469 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
470 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
471 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
472 };
473
474
475 // Advances *only* on space or ASCII vowel (or illegal byte)
476 static const uint8 kAdvanceOneCharSpaceVowel[256] = {
477 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
478 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
479 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
480 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
481
482 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
483 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
484 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
485 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
486 };
487
488
489 // src points to a letter. Find the byte length of a unigram starting there.
490 int UniLen(const char* src);
491
492 // src points to a letter. Find the byte length of a bigram starting there.
493 int BiLen(const char* src);
494
495 // src points to a letter. Find the byte length of a quadgram starting there.
496 int QuadLen(const char* src);
497
498 // src points to a letter. Find the byte length of an octagram starting there.
499 int OctaLen(const char* src);
500
501 } // End namespace CLD2
502
503 #endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
504
505
506
507
508
509
510