1 /*
2 * libpinyin
3 * Library to deal with pinyin.
4 *
5 * Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "chewing_large_table.h"
22 #include <assert.h>
23 #include "pinyin_phrase2.h"
24 #include "pinyin_phrase3.h"
25 #include "pinyin_parser2.h"
26 #include "zhuyin_parser2.h"
27
28
29 /* internal class definition */
30
31 namespace pinyin{
32 class ChewingLengthIndexLevel{
33
34 protected:
35 GArray * m_chewing_array_indexes;
36
37 public:
38 /* constructor/destructor */
39 ChewingLengthIndexLevel();
40 ~ChewingLengthIndexLevel();
41
42 /* load/store method */
43 bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
44 bool store(MemoryChunk * new_chunk, table_offset_t offset,
45 table_offset_t & end);
46
47 /* search method */
48 int search(pinyin_option_t options, int phrase_length,
49 /* in */ const ChewingKey keys[],
50 /* out */ PhraseIndexRanges ranges) const;
51
52 /* add/remove index method */
53 int add_index(int phrase_length, /* in */ const ChewingKey keys[],
54 /* in */ phrase_token_t token);
55 int remove_index(int phrase_length, /* in */ const ChewingKey keys[],
56 /* in */ phrase_token_t token);
57
58 /* get length method */
59 int get_length() const;
60
61 /* mask out method */
62 bool mask_out(phrase_token_t mask, phrase_token_t value);
63 };
64
65
/**
 * ChewingArrayIndexLevel:
 *
 * Leaf level of the table: a flat array of (keys, token) items for key
 * sequences of exactly phrase_length keys, stored in m_chunk.
 * The lookups use binary search with phrase_exact_less_than2, so the
 * items are expected to stay ordered by that comparison.
 */
template<size_t phrase_length>
class ChewingArrayIndexLevel{
protected:
    /* one stored entry: phrase_length keys plus the phrase token. */
    typedef PinyinIndexItem2<phrase_length> IndexItem;

protected:
    /* raw storage holding the IndexItem array. */
    MemoryChunk m_chunk;

    /* compress consecutive tokens: turns the matching items of
       [begin, end) into token ranges appended to ranges. */
    int convert(pinyin_option_t options,
                const ChewingKey keys[],
                IndexItem * begin,
                IndexItem * end,
                PhraseIndexRanges ranges) const;

public:
    /* load/store method */
    bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
    bool store(MemoryChunk * new_chunk, table_offset_t offset,
               table_offset_t & end);

    /* search method: returns a bit set of SEARCH_* flags. */
    int search(pinyin_option_t options, /* in */const ChewingKey keys[],
               /* out */ PhraseIndexRanges ranges) const;

    /* add/remove index method: both return an ERROR_* code. */
    int add_index(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token);
    int remove_index(/* in */ const ChewingKey keys[],
                     /* in */ phrase_token_t token);

    /* get length method: number of items held in m_chunk. */
    int get_length() const;

    /* mask out method: removes every item with (token & mask) == value. */
    bool mask_out(phrase_token_t mask, phrase_token_t value);
};
102
103 };
104
105
106 using namespace pinyin;
107
108 /* class implementation */
109
ChewingBitmapIndexLevel(pinyin_option_t options)110 ChewingBitmapIndexLevel::ChewingBitmapIndexLevel(pinyin_option_t options)
111 : m_options(options) {
112 memset(m_chewing_length_indexes, 0, sizeof(m_chewing_length_indexes));
113 }
114
reset()115 void ChewingBitmapIndexLevel::reset() {
116 for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k)
117 for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
118 for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m)
119 for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES;
120 ++n) {
121 ChewingLengthIndexLevel * & length_array =
122 m_chewing_length_indexes[k][l][m][n];
123 if (length_array)
124 delete length_array;
125 length_array = NULL;
126 }
127 }
128
129
130 /* search method */
131
search(int phrase_length,const ChewingKey keys[],PhraseIndexRanges ranges) const132 int ChewingBitmapIndexLevel::search(int phrase_length,
133 /* in */ const ChewingKey keys[],
134 /* out */ PhraseIndexRanges ranges) const {
135 assert(phrase_length > 0);
136 return initial_level_search(phrase_length, keys, ranges);
137 }
138
initial_level_search(int phrase_length,const ChewingKey keys[],PhraseIndexRanges ranges) const139 int ChewingBitmapIndexLevel::initial_level_search (int phrase_length,
140 /* in */ const ChewingKey keys[], /* out */ PhraseIndexRanges ranges) const {
141
142 /* macros */
143 #define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \
144 { \
145 result |= middle_and_final_level_search(ORIGIN, phrase_length, \
146 keys, ranges); \
147 if (m_options & AMBIGUITY) { \
148 result |= middle_and_final_level_search(ANOTHER, \
149 phrase_length, \
150 keys, ranges); \
151 } \
152 return result; \
153 }
154
155 /* deal with ambiguities */
156 int result = SEARCH_NONE;
157 const ChewingKey & first_key = keys[0];
158
159 switch(first_key.m_initial) {
160 MATCH(PINYIN_AMB_C_CH, CHEWING_C, CHEWING_CH);
161 MATCH(PINYIN_AMB_C_CH, CHEWING_CH, CHEWING_C);
162 MATCH(PINYIN_AMB_Z_ZH, CHEWING_Z, CHEWING_ZH);
163 MATCH(PINYIN_AMB_Z_ZH, CHEWING_ZH, CHEWING_Z);
164 MATCH(PINYIN_AMB_S_SH, CHEWING_S, CHEWING_SH);
165 MATCH(PINYIN_AMB_S_SH, CHEWING_SH, CHEWING_S);
166 MATCH(PINYIN_AMB_L_R, CHEWING_R, CHEWING_L);
167 MATCH(PINYIN_AMB_L_N, CHEWING_N, CHEWING_L);
168 MATCH(PINYIN_AMB_F_H, CHEWING_F, CHEWING_H);
169 MATCH(PINYIN_AMB_F_H, CHEWING_H, CHEWING_F);
170 MATCH(PINYIN_AMB_G_K, CHEWING_G, CHEWING_K);
171 MATCH(PINYIN_AMB_G_K, CHEWING_K, CHEWING_G);
172
173 case CHEWING_L:
174 {
175 result |= middle_and_final_level_search
176 (CHEWING_L, phrase_length, keys, ranges);
177
178 if (m_options & PINYIN_AMB_L_N)
179 result |= middle_and_final_level_search
180 (CHEWING_N, phrase_length, keys,ranges);
181
182 if (m_options & PINYIN_AMB_L_R)
183 result |= middle_and_final_level_search
184 (CHEWING_R, phrase_length, keys, ranges);
185 return result;
186 }
187 default:
188 {
189 result |= middle_and_final_level_search
190 ((ChewingInitial) first_key.m_initial,
191 phrase_length, keys, ranges);
192 return result;
193 }
194 }
195 #undef MATCH
196 return result;
197 }
198
199
middle_and_final_level_search(ChewingInitial initial,int phrase_length,const ChewingKey keys[],PhraseIndexRanges ranges) const200 int ChewingBitmapIndexLevel::middle_and_final_level_search
201 (ChewingInitial initial, int phrase_length, /* in */ const ChewingKey keys[],
202 /* out */ PhraseIndexRanges ranges) const {
203
204 /* macros */
205 #define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN: \
206 { \
207 result = tone_level_search \
208 (initial, middle, \
209 ORIGIN, phrase_length, keys, ranges); \
210 if (m_options & AMBIGUITY) { \
211 result |= tone_level_search \
212 (initial, middle, \
213 ANOTHER, phrase_length, keys, ranges); \
214 } \
215 return result; \
216 }
217
218 int result = SEARCH_NONE;
219 const ChewingKey & first_key = keys[0];
220 const ChewingMiddle middle = (ChewingMiddle)first_key.m_middle;
221
222 switch(first_key.m_final) {
223 case CHEWING_ZERO_FINAL:
224 {
225 if (middle == CHEWING_ZERO_MIDDLE) { /* in-complete pinyin */
226 if (!(m_options & PINYIN_INCOMPLETE))
227 return result;
228 for (int m = CHEWING_ZERO_MIDDLE;
229 m < CHEWING_NUMBER_OF_MIDDLES; ++m)
230 for (int n = CHEWING_ZERO_FINAL;
231 n < CHEWING_NUMBER_OF_FINALS; ++n) {
232
233 if (CHEWING_ZERO_MIDDLE == m &&
234 CHEWING_ZERO_FINAL == n)
235 continue;
236
237 result |= tone_level_search
238 (initial, (ChewingMiddle) m, (ChewingFinal) n,
239 phrase_length, keys, ranges);
240 }
241 return result;
242 } else { /* normal pinyin */
243 result |= tone_level_search
244 (initial, middle, CHEWING_ZERO_FINAL,
245 phrase_length, keys, ranges);
246 return result;
247 }
248 }
249
250 MATCH(PINYIN_AMB_AN_ANG, CHEWING_AN, CHEWING_ANG);
251 MATCH(PINYIN_AMB_AN_ANG, CHEWING_ANG, CHEWING_AN);
252 MATCH(PINYIN_AMB_EN_ENG, CHEWING_EN, CHEWING_ENG);
253 MATCH(PINYIN_AMB_EN_ENG, CHEWING_ENG, CHEWING_EN);
254 MATCH(PINYIN_AMB_IN_ING, PINYIN_IN, PINYIN_ING);
255 MATCH(PINYIN_AMB_IN_ING, PINYIN_ING, PINYIN_IN);
256
257 default:
258 {
259 result |= tone_level_search
260 (initial, middle, (ChewingFinal) first_key.m_final,
261 phrase_length, keys, ranges);
262 return result;
263 }
264 }
265 #undef MATCH
266 return result;
267 }
268
269
tone_level_search(ChewingInitial initial,ChewingMiddle middle,ChewingFinal final,int phrase_length,const ChewingKey keys[],PhraseIndexRanges ranges) const270 int ChewingBitmapIndexLevel::tone_level_search
271 (ChewingInitial initial, ChewingMiddle middle, ChewingFinal final,
272 int phrase_length, /* in */ const ChewingKey keys[],
273 /* out */ PhraseIndexRanges ranges) const {
274
275 int result = SEARCH_NONE;
276 const ChewingKey & first_key = keys[0];
277
278 switch (first_key.m_tone) {
279 case CHEWING_ZERO_TONE:
280 {
281 /* deal with zero tone in chewing large table. */
282 for (int i = CHEWING_ZERO_TONE; i < CHEWING_NUMBER_OF_TONES; ++i) {
283 ChewingLengthIndexLevel * phrases =
284 m_chewing_length_indexes
285 [initial][middle][final][(ChewingTone)i];
286 if (phrases)
287 result |= phrases->search
288 (m_options, phrase_length - 1, keys + 1, ranges);
289 }
290 return result;
291 }
292 default:
293 {
294 ChewingLengthIndexLevel * phrases =
295 m_chewing_length_indexes
296 [initial][middle][final][CHEWING_ZERO_TONE];
297 if (phrases)
298 result |= phrases->search
299 (m_options, phrase_length - 1, keys + 1, ranges);
300
301 phrases = m_chewing_length_indexes
302 [initial][middle][final][(ChewingTone) first_key.m_tone];
303 if (phrases)
304 result |= phrases->search
305 (m_options, phrase_length - 1, keys + 1, ranges);
306 return result;
307 }
308 }
309 return result;
310 }
311
312
ChewingLengthIndexLevel()313 ChewingLengthIndexLevel::ChewingLengthIndexLevel() {
314 m_chewing_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
315 }
316
~ChewingLengthIndexLevel()317 ChewingLengthIndexLevel::~ChewingLengthIndexLevel() {
318 #define CASE(len) case len: \
319 { \
320 ChewingArrayIndexLevel<len> * & array = g_array_index \
321 (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
322 if (array) \
323 delete array; \
324 array = NULL; \
325 break; \
326 }
327
328 for (guint i = 0; i < m_chewing_array_indexes->len; ++i) {
329 switch (i){
330 CASE(0);
331 CASE(1);
332 CASE(2);
333 CASE(3);
334 CASE(4);
335 CASE(5);
336 CASE(6);
337 CASE(7);
338 CASE(8);
339 CASE(9);
340 CASE(10);
341 CASE(11);
342 CASE(12);
343 CASE(13);
344 CASE(14);
345 CASE(15);
346 default:
347 assert(false);
348 }
349 }
350 #undef CASE
351 g_array_free(m_chewing_array_indexes, TRUE);
352 }
353
354
search(pinyin_option_t options,int phrase_length,const ChewingKey keys[],PhraseIndexRanges ranges) const355 int ChewingLengthIndexLevel::search(pinyin_option_t options, int phrase_length,
356 /* in */ const ChewingKey keys[],
357 /* out */ PhraseIndexRanges ranges) const {
358 int result = SEARCH_NONE;
359 if ((int) m_chewing_array_indexes->len < phrase_length + 1)
360 return result;
361 if ((int) m_chewing_array_indexes->len > phrase_length + 1)
362 result |= SEARCH_CONTINUED;
363
364 #define CASE(len) case len: \
365 { \
366 ChewingArrayIndexLevel<len> * & array = g_array_index \
367 (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
368 if (!array) \
369 return result; \
370 result |= array->search(options, keys, ranges); \
371 return result; \
372 }
373
374 switch (phrase_length) {
375 CASE(0);
376 CASE(1);
377 CASE(2);
378 CASE(3);
379 CASE(4);
380 CASE(5);
381 CASE(6);
382 CASE(7);
383 CASE(8);
384 CASE(9);
385 CASE(10);
386 CASE(11);
387 CASE(12);
388 CASE(13);
389 CASE(14);
390 CASE(15);
391 default:
392 assert(false);
393 }
394
395 #undef CASE
396 }
397
398
399 template<size_t phrase_length>
search(pinyin_option_t options,const ChewingKey keys[],PhraseIndexRanges ranges) const400 int ChewingArrayIndexLevel<phrase_length>::search
401 (pinyin_option_t options, /* in */ const ChewingKey keys[],
402 /* out */ PhraseIndexRanges ranges) const {
403 IndexItem * chunk_begin = NULL, * chunk_end = NULL;
404 chunk_begin = (IndexItem *) m_chunk.begin();
405 chunk_end = (IndexItem *) m_chunk.end();
406
407 /* do the search */
408 ChewingKey left_keys[phrase_length], right_keys[phrase_length];
409 compute_lower_value2(options, keys, left_keys, phrase_length);
410 compute_upper_value2(options, keys, right_keys, phrase_length);
411
412 IndexItem left(left_keys, -1), right(right_keys, -1);
413
414 IndexItem * begin = std_lite::lower_bound
415 (chunk_begin, chunk_end, left,
416 phrase_exact_less_than2<phrase_length>);
417 IndexItem * end = std_lite::upper_bound
418 (chunk_begin, chunk_end, right,
419 phrase_exact_less_than2<phrase_length>);
420
421 return convert(options, keys, begin, end, ranges);
422 }
423
424 /* compress consecutive tokens */
425 template<size_t phrase_length>
convert(pinyin_option_t options,const ChewingKey keys[],IndexItem * begin,IndexItem * end,PhraseIndexRanges ranges) const426 int ChewingArrayIndexLevel<phrase_length>::convert
427 (pinyin_option_t options, const ChewingKey keys[],
428 IndexItem * begin, IndexItem * end,
429 PhraseIndexRanges ranges) const {
430 IndexItem * iter = NULL;
431 PhraseIndexRange cursor;
432 GArray * head, * cursor_head = NULL;
433
434 int result = SEARCH_NONE;
435 /* TODO: check the below code */
436 cursor.m_range_begin = null_token; cursor.m_range_end = null_token;
437 for (iter = begin; iter != end; ++iter) {
438 if (0 != pinyin_compare_with_ambiguities2
439 (options, keys, iter->m_keys, phrase_length))
440 continue;
441
442 phrase_token_t token = iter->m_token;
443 head = ranges[PHRASE_INDEX_LIBRARY_INDEX(token)];
444 if (NULL == head)
445 continue;
446
447 result |= SEARCH_OK;
448
449 if (null_token == cursor.m_range_begin) {
450 cursor.m_range_begin = token;
451 cursor.m_range_end = token + 1;
452 cursor_head = head;
453 } else if (cursor.m_range_end == token &&
454 PHRASE_INDEX_LIBRARY_INDEX(cursor.m_range_begin) ==
455 PHRASE_INDEX_LIBRARY_INDEX(token)) {
456 ++cursor.m_range_end;
457 } else {
458 g_array_append_val(cursor_head, cursor);
459 cursor.m_range_begin = token; cursor.m_range_end = token + 1;
460 cursor_head = head;
461 }
462 }
463
464 if (null_token == cursor.m_range_begin)
465 return result;
466
467 g_array_append_val(cursor_head, cursor);
468 return result;
469 }
470
471
472 /* add/remove index method */
473
add_index(int phrase_length,const ChewingKey keys[],phrase_token_t token)474 int ChewingBitmapIndexLevel::add_index(int phrase_length,
475 /* in */ const ChewingKey keys[],
476 /* in */ phrase_token_t token) {
477 const ChewingKey first_key = keys[0];
478 ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes
479 [first_key.m_initial][first_key.m_middle]
480 [first_key.m_final][first_key.m_tone];
481
482 if (NULL == length_array) {
483 length_array = new ChewingLengthIndexLevel();
484 }
485
486 return length_array->add_index(phrase_length - 1, keys + 1, token);
487 }
488
remove_index(int phrase_length,const ChewingKey keys[],phrase_token_t token)489 int ChewingBitmapIndexLevel::remove_index(int phrase_length,
490 /* in */ const ChewingKey keys[],
491 /* in */ phrase_token_t token) {
492 const ChewingKey first_key = keys[0];
493 ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes
494 [first_key.m_initial][first_key.m_middle]
495 [first_key.m_final][first_key.m_tone];
496
497 if (NULL == length_array)
498 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
499
500 int retval = length_array->remove_index(phrase_length - 1, keys + 1, token);
501
502 /* remove empty array. */
503 if (0 == length_array->get_length()) {
504 delete length_array;
505 length_array = NULL;
506 }
507
508 return retval;
509 }
510
add_index(int phrase_length,const ChewingKey keys[],phrase_token_t token)511 int ChewingLengthIndexLevel::add_index(int phrase_length,
512 /* in */ const ChewingKey keys[],
513 /* in */ phrase_token_t token) {
514 if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
515 return ERROR_PHRASE_TOO_LONG;
516
517 if ((int) m_chewing_array_indexes->len <= phrase_length)
518 g_array_set_size(m_chewing_array_indexes, phrase_length + 1);
519
520 #define CASE(len) case len: \
521 { \
522 ChewingArrayIndexLevel<len> * & array = g_array_index \
523 (m_chewing_array_indexes, \
524 ChewingArrayIndexLevel<len> *, len); \
525 if (NULL == array) \
526 array = new ChewingArrayIndexLevel<len>; \
527 return array->add_index(keys, token); \
528 }
529
530 switch(phrase_length) {
531 CASE(0);
532 CASE(1);
533 CASE(2);
534 CASE(3);
535 CASE(4);
536 CASE(5);
537 CASE(6);
538 CASE(7);
539 CASE(8);
540 CASE(9);
541 CASE(10);
542 CASE(11);
543 CASE(12);
544 CASE(13);
545 CASE(14);
546 CASE(15);
547 default:
548 assert(false);
549 }
550
551 #undef CASE
552 }
553
remove_index(int phrase_length,const ChewingKey keys[],phrase_token_t token)554 int ChewingLengthIndexLevel::remove_index(int phrase_length,
555 /* in */ const ChewingKey keys[],
556 /* in */ phrase_token_t token) {
557 if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
558 return ERROR_PHRASE_TOO_LONG;
559
560 if ((int) m_chewing_array_indexes->len <= phrase_length)
561 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
562
563 #define CASE(len) case len: \
564 { \
565 ChewingArrayIndexLevel<len> * & array = g_array_index \
566 (m_chewing_array_indexes, \
567 ChewingArrayIndexLevel<len> *, len); \
568 if (NULL == array) \
569 return ERROR_REMOVE_ITEM_DONOT_EXISTS; \
570 int retval = array->remove_index(keys, token); \
571 \
572 /* remove empty array. */ \
573 if (0 == array->get_length()) { \
574 delete array; \
575 array = NULL; \
576 \
577 /* shrink self array. */ \
578 g_array_set_size(m_chewing_array_indexes, \
579 get_length()); \
580 } \
581 return retval; \
582 }
583
584 switch (phrase_length) {
585 CASE(0);
586 CASE(1);
587 CASE(2);
588 CASE(3);
589 CASE(4);
590 CASE(5);
591 CASE(6);
592 CASE(7);
593 CASE(8);
594 CASE(9);
595 CASE(10);
596 CASE(11);
597 CASE(12);
598 CASE(13);
599 CASE(14);
600 CASE(15);
601 default:
602 assert(false);
603 }
604
605 #undef CASE
606 }
607
608 template<size_t phrase_length>
add_index(const ChewingKey keys[],phrase_token_t token)609 int ChewingArrayIndexLevel<phrase_length>::add_index
610 (/* in */ const ChewingKey keys[], /* in */ phrase_token_t token) {
611 IndexItem * begin, * end;
612
613 IndexItem add_elem(keys, token);
614 begin = (IndexItem *) m_chunk.begin();
615 end = (IndexItem *) m_chunk.end();
616
617 std_lite::pair<IndexItem *, IndexItem *> range;
618 range = std_lite::equal_range
619 (begin, end, add_elem, phrase_exact_less_than2<phrase_length>);
620
621 IndexItem * cur_elem;
622 for (cur_elem = range.first;
623 cur_elem != range.second; ++cur_elem) {
624 if (cur_elem->m_token == token)
625 return ERROR_INSERT_ITEM_EXISTS;
626 if (cur_elem->m_token > token)
627 break;
628 }
629
630 int offset = (cur_elem - begin) * sizeof(IndexItem);
631 m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem));
632 return ERROR_OK;
633 }
634
635 template<size_t phrase_length>
remove_index(const ChewingKey keys[],phrase_token_t token)636 int ChewingArrayIndexLevel<phrase_length>::remove_index
637 (/* in */ const ChewingKey keys[], /* in */ phrase_token_t token) {
638 IndexItem * begin, * end;
639
640 IndexItem remove_elem(keys, token);
641 begin = (IndexItem *) m_chunk.begin();
642 end = (IndexItem *) m_chunk.end();
643
644 std_lite::pair<IndexItem *, IndexItem *> range;
645 range = std_lite::equal_range
646 (begin, end, remove_elem, phrase_exact_less_than2<phrase_length>);
647
648 IndexItem * cur_elem;
649 for (cur_elem = range.first;
650 cur_elem != range.second; ++cur_elem) {
651 if (cur_elem->m_token == token)
652 break;
653 }
654
655 if (cur_elem == range.second)
656 return ERROR_REMOVE_ITEM_DONOT_EXISTS;
657
658 int offset = (cur_elem - begin) * sizeof(IndexItem);
659 m_chunk.remove_content(offset, sizeof(IndexItem));
660 return ERROR_OK;
661 }
662
663
664 /* load text method */
load_text(FILE * infile,TABLE_PHONETIC_TYPE type)665 bool ChewingLargeTable::load_text(FILE * infile, TABLE_PHONETIC_TYPE type) {
666 char pinyin[256];
667 char phrase[256];
668 phrase_token_t token;
669 size_t freq;
670
671 while (!feof(infile)) {
672 int num = fscanf(infile, "%255s %255s %u %ld",
673 pinyin, phrase, &token, &freq);
674
675 if (4 != num)
676 continue;
677
678 if(feof(infile))
679 break;
680
681 glong len = g_utf8_strlen(phrase, -1);
682
683 ChewingKeyVector keys;
684 ChewingKeyRestVector key_rests;
685
686 keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
687 key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
688
689 switch (type) {
690 case PINYIN_TABLE: {
691 PinyinDirectParser2 parser;
692 pinyin_option_t options = USE_TONE;
693 parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
694 break;
695 }
696
697 case ZHUYIN_TABLE: {
698 ZhuyinDirectParser2 parser;
699 pinyin_option_t options = USE_TONE | FORCE_TONE;
700 parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
701 break;
702 }
703 };
704
705 if (len != keys->len) {
706 fprintf(stderr, "ChewingLargeTable::load_text:%s\t%s\t%u\t%ld\n",
707 pinyin, phrase, token, freq);
708 continue;
709 }
710
711 add_index(keys->len, (ChewingKey *)keys->data, token);
712
713 g_array_free(keys, TRUE);
714 g_array_free(key_rests, TRUE);
715 }
716
717 return true;
718 }
719
720
721 /* load/store method */
722
load(MemoryChunk * chunk,table_offset_t offset,table_offset_t end)723 bool ChewingBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
724 table_offset_t end) {
725 reset();
726 char * begin = (char *) chunk->begin();
727 table_offset_t phrase_begin, phrase_end;
728 table_offset_t * index = (table_offset_t *) (begin + offset);
729 phrase_end = *index;
730
731 for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k)
732 for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
733 for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m)
734 for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) {
735 phrase_begin = phrase_end;
736 index++;
737 phrase_end = *index;
738
739 if (phrase_begin == phrase_end) /* null pointer */
740 continue;
741
742 /* after reset() all phrases are null pointer. */
743 ChewingLengthIndexLevel * phrases = new ChewingLengthIndexLevel;
744 m_chewing_length_indexes[k][l][m][n] = phrases;
745
746 phrases->load(chunk, phrase_begin, phrase_end - 1);
747 assert(phrase_end <= end);
748 assert(*(begin + phrase_end - 1) == c_separate);
749 }
750
751 offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t);
752 assert(c_separate == *(begin + offset));
753 return true;
754 }
755
store(MemoryChunk * new_chunk,table_offset_t offset,table_offset_t & end)756 bool ChewingBitmapIndexLevel::store(MemoryChunk * new_chunk,
757 table_offset_t offset,
758 table_offset_t & end) {
759 table_offset_t phrase_end;
760 table_offset_t index = offset;
761 offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t);
762
763 /* add '#' */
764 new_chunk->set_content(offset, &c_separate, sizeof(char));
765 offset += sizeof(char);
766 new_chunk->set_content(index, &offset, sizeof(table_offset_t));
767 index += sizeof(table_offset_t);
768
769 for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k)
770 for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
771 for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m)
772 for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) {
773 ChewingLengthIndexLevel * phrases =
774 m_chewing_length_indexes[k][l][m][n];
775
776 if (NULL == phrases) { /* null pointer */
777 new_chunk->set_content(index, &offset,
778 sizeof(table_offset_t));
779 index += sizeof(table_offset_t);
780 continue;
781 }
782
783 /* has a end '#' */
784 phrases->store(new_chunk, offset, phrase_end);
785 offset = phrase_end;
786
787 /* add '#' */
788 new_chunk->set_content(offset, &c_separate, sizeof(char));
789 offset += sizeof(char);
790 new_chunk->set_content(index, &offset,
791 sizeof(table_offset_t));
792 index += sizeof(table_offset_t);
793 }
794
795 end = offset;
796 return true;
797 }
798
/* Rebuilds the per-length tables from chunk.  The data at offset starts
   with a guint32 slot count, followed by (count + 1) boundary offsets
   and a separator; slot i occupies the bytes between boundary i and
   boundary i + 1 and ends with a separator.  Two equal boundaries
   denote an empty (NULL) slot.  Always returns true; layout violations
   trigger assertions.

   NOTE(review): the slot array is truncated without deleting the tables
   it may already hold -- presumably load() is only called on a freshly
   constructed object; confirm against the callers. */
bool ChewingLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
                                   table_offset_t end) {
    char * begin = (char *) chunk->begin();
    guint32 nindex = *((guint32 *)(begin + offset)); /* number of slots */
    /* the boundary offsets start right behind the slot count. */
    table_offset_t * index = (table_offset_t *)
        (begin + offset + sizeof(guint32));

    table_offset_t phrase_begin, phrase_end = *index;
    g_array_set_size(m_chewing_array_indexes, 0);
    for (guint32 i = 0; i < nindex; ++i) {
        /* the end of the previous slot is the begin of this one. */
        phrase_begin = phrase_end;
        index++;
        phrase_end = *index;

        /* empty slot: keep the positions aligned with a NULL entry. */
        if (phrase_begin == phrase_end) {
            void * null = NULL;
            g_array_append_val(m_chewing_array_indexes, null);
            continue;
        }

/* creates the table for slot `len`, loads it without its trailing
   separator and appends it to the slot array. */
#define CASE(len) case len:                                             \
        {                                                               \
            ChewingArrayIndexLevel<len> * phrase =                      \
                new ChewingArrayIndexLevel<len>;                        \
            phrase->load(chunk, phrase_begin, phrase_end - 1);          \
            assert(*(begin + phrase_end - 1) == c_separate);            \
            assert(phrase_end <= end);                                  \
            g_array_append_val(m_chewing_array_indexes, phrase);        \
            break;                                                      \
        }

        /* the slot number selects the template instance. */
        switch ( i ){
            CASE(0);
            CASE(1);
            CASE(2);
            CASE(3);
            CASE(4);
            CASE(5);
            CASE(6);
            CASE(7);
            CASE(8);
            CASE(9);
            CASE(10);
            CASE(11);
            CASE(12);
            CASE(13);
            CASE(14);
            CASE(15);
        default:
            assert(false);
        }

#undef CASE
    }

    /* check '#': the boundary table must be followed by a separator. */
    offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
    assert(c_separate == *(begin + offset));
    return true;
}
859
/* Serializes the per-length tables into new_chunk starting at offset:
   a guint32 slot count, (count + 1) boundary offsets, a separator, and
   then the data of every non-NULL slot, each followed by a separator.
   An empty slot repeats the previous boundary.  end receives the offset
   just behind the last byte written.  Always returns true. */
bool ChewingLengthIndexLevel::store(MemoryChunk * new_chunk,
                                    table_offset_t offset,
                                    table_offset_t & end) {
    guint32 nindex = m_chewing_array_indexes->len; /* number of slots */
    new_chunk->set_content(offset, &nindex, sizeof(guint32));
    /* position where the next boundary offset is written. */
    table_offset_t index = offset + sizeof(guint32);

    /* skip the header; the data area starts behind its separator. */
    offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
    new_chunk->set_content(offset, &c_separate, sizeof(char));
    offset += sizeof(char);
    /* first boundary: start of the data area. */
    new_chunk->set_content(index, &offset, sizeof(table_offset_t));
    index += sizeof(table_offset_t);

    table_offset_t phrase_end;
    for (guint32 i = 0; i < nindex; ++i) {
/* stores the table of slot `len`.  For a NULL slot only the unchanged
   offset is recorded as boundary, and the `continue` skips the
   separator written at the bottom of the loop. */
#define CASE(len) case len:                                             \
        {                                                               \
            ChewingArrayIndexLevel<len> * phrase = g_array_index        \
                (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
            if (NULL == phrase) {                                       \
                new_chunk->set_content                                  \
                    (index, &offset, sizeof(table_offset_t));           \
                index += sizeof(table_offset_t);                        \
                continue;                                               \
            }                                                           \
            phrase->store(new_chunk, offset, phrase_end);               \
            offset = phrase_end;                                        \
            break;                                                      \
        }

        /* the slot number selects the template instance. */
        switch ( i ){
            CASE(0);
            CASE(1);
            CASE(2);
            CASE(3);
            CASE(4);
            CASE(5);
            CASE(6);
            CASE(7);
            CASE(8);
            CASE(9);
            CASE(10);
            CASE(11);
            CASE(12);
            CASE(13);
            CASE(14);
            CASE(15);
        default:
            assert(false);
        }
#undef CASE

        /* add '#' behind the slot data and record the new boundary. */
        new_chunk->set_content(offset, &c_separate, sizeof(char));
        offset += sizeof(char);
        new_chunk->set_content(index, &offset, sizeof(table_offset_t));
        index += sizeof(table_offset_t);
    }

    end = offset;
    return true;
}
922
923 template<size_t phrase_length>
924 bool ChewingArrayIndexLevel<phrase_length>::
load(MemoryChunk * chunk,table_offset_t offset,table_offset_t end)925 load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end) {
926 char * begin = (char *) chunk->begin();
927 m_chunk.set_chunk(begin + offset, end - offset, NULL);
928 return true;
929 }
930
931 template<size_t phrase_length>
932 bool ChewingArrayIndexLevel<phrase_length>::
store(MemoryChunk * new_chunk,table_offset_t offset,table_offset_t & end)933 store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
934 new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
935 end = offset + m_chunk.size();
936 return true;
937 }
938
939
940 /* get length method */
941
get_length() const942 int ChewingLengthIndexLevel::get_length() const {
943 int length = m_chewing_array_indexes->len;
944
945 /* trim trailing zero. */
946 for (int i = length - 1; i >= 0; --i) {
947 void * array = g_array_index(m_chewing_array_indexes, void *, i);
948
949 if (NULL != array)
950 break;
951
952 --length;
953 }
954
955 return length;
956 }
957
958 template<size_t phrase_length>
get_length() const959 int ChewingArrayIndexLevel<phrase_length>::get_length() const {
960 IndexItem * chunk_begin = NULL, * chunk_end = NULL;
961 chunk_begin = (IndexItem *) m_chunk.begin();
962 chunk_end = (IndexItem *) m_chunk.end();
963
964 return chunk_end - chunk_begin;
965 }
966
967
968 /* mask out method */
969
mask_out(phrase_token_t mask,phrase_token_t value)970 bool ChewingBitmapIndexLevel::mask_out(phrase_token_t mask,
971 phrase_token_t value) {
972 for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k)
973 for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
974 for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m)
975 for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES;
976 ++n) {
977 ChewingLengthIndexLevel * & length_array =
978 m_chewing_length_indexes[k][l][m][n];
979
980 if (NULL == length_array)
981 continue;
982
983 length_array->mask_out(mask, value);
984
985 if (0 == length_array->get_length()) {
986 delete length_array;
987 length_array = NULL;
988 }
989 }
990 return true;
991 }
992
mask_out(phrase_token_t mask,phrase_token_t value)993 bool ChewingLengthIndexLevel::mask_out(phrase_token_t mask,
994 phrase_token_t value) {
995 #define CASE(len) case len: \
996 { \
997 ChewingArrayIndexLevel<len> * & array = g_array_index \
998 (m_chewing_array_indexes, \
999 ChewingArrayIndexLevel<len> *, len); \
1000 \
1001 if (NULL == array) \
1002 continue; \
1003 \
1004 array->mask_out(mask, value); \
1005 \
1006 if (0 == array->get_length()) { \
1007 delete array; \
1008 array = NULL; \
1009 } \
1010 break; \
1011 }
1012
1013 for (guint i = 0; i < m_chewing_array_indexes->len; ++i) {
1014 switch (i){
1015 CASE(0);
1016 CASE(1);
1017 CASE(2);
1018 CASE(3);
1019 CASE(4);
1020 CASE(5);
1021 CASE(6);
1022 CASE(7);
1023 CASE(8);
1024 CASE(9);
1025 CASE(10);
1026 CASE(11);
1027 CASE(12);
1028 CASE(13);
1029 CASE(14);
1030 CASE(15);
1031 default:
1032 assert(false);
1033 }
1034 }
1035 #undef CASE
1036 g_array_set_size(m_chewing_array_indexes, get_length());
1037 return true;
1038 }
1039
1040 template<size_t phrase_length>
mask_out(phrase_token_t mask,phrase_token_t value)1041 bool ChewingArrayIndexLevel<phrase_length>::mask_out
1042 (phrase_token_t mask, phrase_token_t value) {
1043 IndexItem * begin = NULL, * end = NULL;
1044 begin = (IndexItem *) m_chunk.begin();
1045 end = (IndexItem *) m_chunk.end();
1046
1047 for (IndexItem * cur = begin; cur != end; ++cur) {
1048 if ((cur->m_token & mask) != value)
1049 continue;
1050
1051 int offset = (cur - begin) * sizeof(IndexItem);
1052 m_chunk.remove_content(offset, sizeof(IndexItem));
1053
1054 /* update chunk end. */
1055 end = (IndexItem *) m_chunk.end();
1056 --cur;
1057 }
1058
1059 return true;
1060 }
1061