1 /*
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *
5  *  Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
6  *
7  *  This program is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include "chewing_large_table.h"
22 #include <assert.h>
23 #include "pinyin_phrase2.h"
24 #include "pinyin_phrase3.h"
25 #include "pinyin_parser2.h"
26 #include "zhuyin_parser2.h"
27 
28 
29 /* internal class definition */
30 
31 namespace pinyin{
32 class ChewingLengthIndexLevel{
33 
34 protected:
35     GArray * m_chewing_array_indexes;
36 
37 public:
38     /* constructor/destructor */
39     ChewingLengthIndexLevel();
40     ~ChewingLengthIndexLevel();
41 
42     /* load/store method */
43     bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
44     bool store(MemoryChunk * new_chunk, table_offset_t offset,
45                table_offset_t & end);
46 
47     /* search method */
48     int search(pinyin_option_t options, int phrase_length,
49                /* in */ const ChewingKey keys[],
50                /* out */ PhraseIndexRanges ranges) const;
51 
52     /* add/remove index method */
53     int add_index(int phrase_length, /* in */ const ChewingKey keys[],
54                   /* in */ phrase_token_t token);
55     int remove_index(int phrase_length, /* in */ const ChewingKey keys[],
56                      /* in */ phrase_token_t token);
57 
58     /* get length method */
59     int get_length() const;
60 
61     /* mask out method */
62     bool mask_out(phrase_token_t mask, phrase_token_t value);
63 };
64 
65 
66 template<size_t phrase_length>
67 class ChewingArrayIndexLevel{
68 protected:
69     typedef PinyinIndexItem2<phrase_length> IndexItem;
70 
71 protected:
72     MemoryChunk m_chunk;
73 
74     /* compress consecutive tokens */
75     int convert(pinyin_option_t options,
76                 const ChewingKey keys[],
77                 IndexItem * begin,
78                 IndexItem * end,
79                 PhraseIndexRanges ranges) const;
80 
81 public:
82     /* load/store method */
83     bool load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end);
84     bool store(MemoryChunk * new_chunk, table_offset_t offset,
85                table_offset_t & end);
86 
87     /* search method */
88     int search(pinyin_option_t options, /* in */const ChewingKey keys[],
89                /* out */ PhraseIndexRanges ranges) const;
90 
91     /* add/remove index method */
92     int add_index(/* in */ const ChewingKey keys[], /* in */ phrase_token_t token);
93     int remove_index(/* in */ const ChewingKey keys[],
94                      /* in */ phrase_token_t token);
95 
96     /* get length method */
97     int get_length() const;
98 
99     /* mask out method */
100     bool mask_out(phrase_token_t mask, phrase_token_t value);
101 };
102 
103 };
104 
105 
106 using namespace pinyin;
107 
108 /* class implementation */
109 
ChewingBitmapIndexLevel(pinyin_option_t options)110 ChewingBitmapIndexLevel::ChewingBitmapIndexLevel(pinyin_option_t options)
111     : m_options(options) {
112     memset(m_chewing_length_indexes, 0, sizeof(m_chewing_length_indexes));
113 }
114 
reset()115 void ChewingBitmapIndexLevel::reset() {
116     for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k)
117         for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
118             for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m)
119                 for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES;
120                      ++n) {
121                     ChewingLengthIndexLevel * & length_array =
122                         m_chewing_length_indexes[k][l][m][n];
123                     if (length_array)
124                         delete length_array;
125                     length_array = NULL;
126                 }
127 }
128 
129 
130 /* search method */
131 
search(int phrase_length,const ChewingKey keys[],PhraseIndexRanges ranges) const132 int ChewingBitmapIndexLevel::search(int phrase_length,
133                                     /* in */ const ChewingKey keys[],
134                                     /* out */ PhraseIndexRanges ranges) const {
135     assert(phrase_length > 0);
136     return initial_level_search(phrase_length, keys, ranges);
137 }
138 
initial_level_search(int phrase_length,const ChewingKey keys[],PhraseIndexRanges ranges) const139 int ChewingBitmapIndexLevel::initial_level_search (int phrase_length,
140     /* in */ const ChewingKey keys[], /* out */ PhraseIndexRanges ranges) const {
141 
142 /* macros */
143 #define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN:                  \
144     {                                                                   \
145         result |= middle_and_final_level_search(ORIGIN, phrase_length,  \
146                                                 keys, ranges);          \
147         if (m_options & AMBIGUITY) {                                    \
148             result |= middle_and_final_level_search(ANOTHER,            \
149                                                     phrase_length,      \
150                                                     keys, ranges);      \
151         }                                                               \
152         return result;                                                  \
153     }
154 
155     /* deal with ambiguities */
156     int result = SEARCH_NONE;
157     const ChewingKey & first_key = keys[0];
158 
159     switch(first_key.m_initial) {
160         MATCH(PINYIN_AMB_C_CH, CHEWING_C, CHEWING_CH);
161         MATCH(PINYIN_AMB_C_CH, CHEWING_CH, CHEWING_C);
162         MATCH(PINYIN_AMB_Z_ZH, CHEWING_Z, CHEWING_ZH);
163         MATCH(PINYIN_AMB_Z_ZH, CHEWING_ZH, CHEWING_Z);
164         MATCH(PINYIN_AMB_S_SH, CHEWING_S, CHEWING_SH);
165         MATCH(PINYIN_AMB_S_SH, CHEWING_SH, CHEWING_S);
166         MATCH(PINYIN_AMB_L_R, CHEWING_R, CHEWING_L);
167         MATCH(PINYIN_AMB_L_N, CHEWING_N, CHEWING_L);
168         MATCH(PINYIN_AMB_F_H, CHEWING_F, CHEWING_H);
169         MATCH(PINYIN_AMB_F_H, CHEWING_H, CHEWING_F);
170         MATCH(PINYIN_AMB_G_K, CHEWING_G, CHEWING_K);
171         MATCH(PINYIN_AMB_G_K, CHEWING_K, CHEWING_G);
172 
173     case CHEWING_L:
174         {
175             result |= middle_and_final_level_search
176                 (CHEWING_L, phrase_length, keys, ranges);
177 
178             if (m_options & PINYIN_AMB_L_N)
179                 result |= middle_and_final_level_search
180                     (CHEWING_N, phrase_length, keys,ranges);
181 
182             if (m_options & PINYIN_AMB_L_R)
183                 result |= middle_and_final_level_search
184                     (CHEWING_R, phrase_length, keys, ranges);
185             return result;
186         }
187     default:
188         {
189             result |= middle_and_final_level_search
190                 ((ChewingInitial) first_key.m_initial,
191                  phrase_length, keys, ranges);
192             return result;
193         }
194     }
195 #undef MATCH
196     return result;
197 }
198 
199 
middle_and_final_level_search(ChewingInitial initial,int phrase_length,const ChewingKey keys[],PhraseIndexRanges ranges) const200 int ChewingBitmapIndexLevel::middle_and_final_level_search
201 (ChewingInitial initial, int phrase_length, /* in */ const ChewingKey keys[],
202  /* out */ PhraseIndexRanges ranges) const {
203 
204 /* macros */
205 #define MATCH(AMBIGUITY, ORIGIN, ANOTHER) case ORIGIN:                  \
206     {                                                                   \
207         result = tone_level_search                                      \
208             (initial, middle,                                           \
209              ORIGIN, phrase_length, keys, ranges);                      \
210         if (m_options & AMBIGUITY) {                                    \
211             result |= tone_level_search                                 \
212                 (initial, middle,                                       \
213                  ANOTHER, phrase_length, keys, ranges);                 \
214         }                                                               \
215         return result;                                                  \
216     }
217 
218     int result = SEARCH_NONE;
219     const ChewingKey & first_key = keys[0];
220     const ChewingMiddle middle = (ChewingMiddle)first_key.m_middle;
221 
222     switch(first_key.m_final) {
223     case CHEWING_ZERO_FINAL:
224         {
225             if (middle == CHEWING_ZERO_MIDDLE) { /* in-complete pinyin */
226                 if (!(m_options & PINYIN_INCOMPLETE))
227                     return result;
228                 for (int m = CHEWING_ZERO_MIDDLE;
229                      m < CHEWING_NUMBER_OF_MIDDLES; ++m)
230                     for (int n = CHEWING_ZERO_FINAL;
231                          n < CHEWING_NUMBER_OF_FINALS; ++n) {
232 
233                         if (CHEWING_ZERO_MIDDLE == m &&
234                             CHEWING_ZERO_FINAL == n)
235                             continue;
236 
237                         result |= tone_level_search
238                             (initial, (ChewingMiddle) m, (ChewingFinal) n,
239                              phrase_length, keys, ranges);
240                     }
241                 return result;
242             } else { /* normal pinyin */
243                 result |= tone_level_search
244                     (initial, middle, CHEWING_ZERO_FINAL,
245                      phrase_length, keys, ranges);
246                 return result;
247             }
248         }
249 
250         MATCH(PINYIN_AMB_AN_ANG, CHEWING_AN, CHEWING_ANG);
251 	MATCH(PINYIN_AMB_AN_ANG, CHEWING_ANG, CHEWING_AN);
252 	MATCH(PINYIN_AMB_EN_ENG, CHEWING_EN, CHEWING_ENG);
253 	MATCH(PINYIN_AMB_EN_ENG, CHEWING_ENG, CHEWING_EN);
254 	MATCH(PINYIN_AMB_IN_ING, PINYIN_IN, PINYIN_ING);
255 	MATCH(PINYIN_AMB_IN_ING, PINYIN_ING, PINYIN_IN);
256 
257     default:
258         {
259             result |= tone_level_search
260                 (initial, middle, (ChewingFinal) first_key.m_final,
261                  phrase_length, keys, ranges);
262             return result;
263         }
264     }
265 #undef MATCH
266     return result;
267 }
268 
269 
tone_level_search(ChewingInitial initial,ChewingMiddle middle,ChewingFinal final,int phrase_length,const ChewingKey keys[],PhraseIndexRanges ranges) const270 int ChewingBitmapIndexLevel::tone_level_search
271 (ChewingInitial initial, ChewingMiddle middle, ChewingFinal final,
272  int phrase_length, /* in */ const ChewingKey keys[],
273  /* out */ PhraseIndexRanges ranges) const {
274 
275     int result = SEARCH_NONE;
276     const ChewingKey & first_key = keys[0];
277 
278     switch (first_key.m_tone) {
279     case CHEWING_ZERO_TONE:
280         {
281             /* deal with zero tone in chewing large table. */
282             for (int i = CHEWING_ZERO_TONE; i < CHEWING_NUMBER_OF_TONES; ++i) {
283                 ChewingLengthIndexLevel * phrases =
284                     m_chewing_length_indexes
285                     [initial][middle][final][(ChewingTone)i];
286                 if (phrases)
287                     result |= phrases->search
288                         (m_options, phrase_length - 1, keys + 1, ranges);
289             }
290             return result;
291         }
292     default:
293         {
294             ChewingLengthIndexLevel * phrases =
295                 m_chewing_length_indexes
296                 [initial][middle][final][CHEWING_ZERO_TONE];
297             if (phrases)
298                 result |= phrases->search
299                     (m_options, phrase_length - 1, keys + 1, ranges);
300 
301             phrases = m_chewing_length_indexes
302                 [initial][middle][final][(ChewingTone) first_key.m_tone];
303             if (phrases)
304                 result |= phrases->search
305                     (m_options, phrase_length - 1, keys + 1, ranges);
306             return result;
307         }
308     }
309     return result;
310 }
311 
312 
ChewingLengthIndexLevel()313 ChewingLengthIndexLevel::ChewingLengthIndexLevel() {
314     m_chewing_array_indexes = g_array_new(FALSE, TRUE, sizeof(void *));
315 }
316 
~ChewingLengthIndexLevel()317 ChewingLengthIndexLevel::~ChewingLengthIndexLevel() {
318 #define CASE(len) case len:                                             \
319     {                                                                   \
320         ChewingArrayIndexLevel<len> * & array = g_array_index           \
321             (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
322         if (array)                                                      \
323             delete array;                                               \
324         array = NULL;                                                   \
325         break;                                                          \
326     }
327 
328     for (guint i = 0; i < m_chewing_array_indexes->len; ++i) {
329         switch (i){
330 	    CASE(0);
331 	    CASE(1);
332 	    CASE(2);
333 	    CASE(3);
334 	    CASE(4);
335 	    CASE(5);
336 	    CASE(6);
337 	    CASE(7);
338 	    CASE(8);
339 	    CASE(9);
340 	    CASE(10);
341 	    CASE(11);
342 	    CASE(12);
343 	    CASE(13);
344 	    CASE(14);
345 	    CASE(15);
346 	default:
347 	    assert(false);
348 	}
349     }
350 #undef CASE
351     g_array_free(m_chewing_array_indexes, TRUE);
352 }
353 
354 
search(pinyin_option_t options,int phrase_length,const ChewingKey keys[],PhraseIndexRanges ranges) const355 int ChewingLengthIndexLevel::search(pinyin_option_t options, int phrase_length,
356                                     /* in */ const ChewingKey keys[],
357                                     /* out */ PhraseIndexRanges ranges) const {
358     int result = SEARCH_NONE;
359     if ((int) m_chewing_array_indexes->len < phrase_length + 1)
360         return result;
361     if ((int) m_chewing_array_indexes->len > phrase_length + 1)
362         result |= SEARCH_CONTINUED;
363 
364 #define CASE(len) case len:                                             \
365     {                                                                   \
366         ChewingArrayIndexLevel<len> * & array = g_array_index           \
367             (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
368         if (!array)                                                     \
369             return result;                                              \
370         result |= array->search(options, keys, ranges);                 \
371         return result;                                                  \
372     }
373 
374     switch (phrase_length) {
375 	CASE(0);
376 	CASE(1);
377 	CASE(2);
378 	CASE(3);
379 	CASE(4);
380 	CASE(5);
381 	CASE(6);
382 	CASE(7);
383 	CASE(8);
384 	CASE(9);
385 	CASE(10);
386 	CASE(11);
387 	CASE(12);
388 	CASE(13);
389 	CASE(14);
390 	CASE(15);
391     default:
392 	assert(false);
393     }
394 
395 #undef CASE
396 }
397 
398 
399 template<size_t phrase_length>
search(pinyin_option_t options,const ChewingKey keys[],PhraseIndexRanges ranges) const400 int ChewingArrayIndexLevel<phrase_length>::search
401 (pinyin_option_t options, /* in */ const ChewingKey keys[],
402  /* out */ PhraseIndexRanges ranges) const {
403     IndexItem * chunk_begin = NULL, * chunk_end = NULL;
404     chunk_begin = (IndexItem *) m_chunk.begin();
405     chunk_end = (IndexItem *) m_chunk.end();
406 
407     /* do the search */
408     ChewingKey left_keys[phrase_length], right_keys[phrase_length];
409     compute_lower_value2(options, keys, left_keys, phrase_length);
410     compute_upper_value2(options, keys, right_keys, phrase_length);
411 
412     IndexItem left(left_keys, -1), right(right_keys, -1);
413 
414     IndexItem * begin = std_lite::lower_bound
415         (chunk_begin, chunk_end, left,
416          phrase_exact_less_than2<phrase_length>);
417     IndexItem * end   = std_lite::upper_bound
418         (chunk_begin, chunk_end, right,
419          phrase_exact_less_than2<phrase_length>);
420 
421     return convert(options, keys, begin, end, ranges);
422 }
423 
424 /* compress consecutive tokens */
425 template<size_t phrase_length>
convert(pinyin_option_t options,const ChewingKey keys[],IndexItem * begin,IndexItem * end,PhraseIndexRanges ranges) const426 int ChewingArrayIndexLevel<phrase_length>::convert
427 (pinyin_option_t options, const ChewingKey keys[],
428  IndexItem * begin, IndexItem * end,
429  PhraseIndexRanges ranges) const {
430     IndexItem * iter = NULL;
431     PhraseIndexRange cursor;
432     GArray * head, * cursor_head = NULL;
433 
434     int result = SEARCH_NONE;
435     /* TODO: check the below code */
436     cursor.m_range_begin = null_token; cursor.m_range_end = null_token;
437     for (iter = begin; iter != end; ++iter) {
438         if (0 != pinyin_compare_with_ambiguities2
439             (options, keys, iter->m_keys, phrase_length))
440             continue;
441 
442         phrase_token_t token = iter->m_token;
443         head = ranges[PHRASE_INDEX_LIBRARY_INDEX(token)];
444         if (NULL == head)
445             continue;
446 
447         result |= SEARCH_OK;
448 
449         if (null_token == cursor.m_range_begin) {
450             cursor.m_range_begin = token;
451             cursor.m_range_end   = token + 1;
452             cursor_head = head;
453         } else if (cursor.m_range_end == token &&
454                    PHRASE_INDEX_LIBRARY_INDEX(cursor.m_range_begin) ==
455                    PHRASE_INDEX_LIBRARY_INDEX(token)) {
456             ++cursor.m_range_end;
457         } else {
458             g_array_append_val(cursor_head, cursor);
459             cursor.m_range_begin = token; cursor.m_range_end = token + 1;
460             cursor_head = head;
461         }
462     }
463 
464     if (null_token == cursor.m_range_begin)
465         return result;
466 
467     g_array_append_val(cursor_head, cursor);
468     return result;
469 }
470 
471 
472 /* add/remove index method */
473 
add_index(int phrase_length,const ChewingKey keys[],phrase_token_t token)474 int ChewingBitmapIndexLevel::add_index(int phrase_length,
475                                        /* in */ const ChewingKey keys[],
476                                        /* in */ phrase_token_t token) {
477     const ChewingKey first_key = keys[0];
478     ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes
479         [first_key.m_initial][first_key.m_middle]
480         [first_key.m_final][first_key.m_tone];
481 
482     if (NULL == length_array) {
483         length_array = new ChewingLengthIndexLevel();
484     }
485 
486     return length_array->add_index(phrase_length - 1, keys + 1, token);
487 }
488 
remove_index(int phrase_length,const ChewingKey keys[],phrase_token_t token)489 int ChewingBitmapIndexLevel::remove_index(int phrase_length,
490                                           /* in */ const ChewingKey keys[],
491                                           /* in */ phrase_token_t token) {
492     const ChewingKey first_key = keys[0];
493     ChewingLengthIndexLevel * & length_array = m_chewing_length_indexes
494         [first_key.m_initial][first_key.m_middle]
495         [first_key.m_final][first_key.m_tone];
496 
497     if (NULL == length_array)
498         return ERROR_REMOVE_ITEM_DONOT_EXISTS;
499 
500     int retval = length_array->remove_index(phrase_length - 1, keys + 1, token);
501 
502     /* remove empty array. */
503     if (0 == length_array->get_length()) {
504         delete length_array;
505         length_array = NULL;
506     }
507 
508     return retval;
509 }
510 
add_index(int phrase_length,const ChewingKey keys[],phrase_token_t token)511 int ChewingLengthIndexLevel::add_index(int phrase_length,
512                                        /* in */ const ChewingKey keys[],
513                                        /* in */ phrase_token_t token) {
514     if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
515         return ERROR_PHRASE_TOO_LONG;
516 
517     if ((int) m_chewing_array_indexes->len <= phrase_length)
518         g_array_set_size(m_chewing_array_indexes, phrase_length + 1);
519 
520 #define CASE(len) case len:                                     \
521     {                                                           \
522         ChewingArrayIndexLevel<len> * & array = g_array_index   \
523             (m_chewing_array_indexes,                           \
524              ChewingArrayIndexLevel<len> *, len);               \
525         if (NULL == array)                                      \
526             array = new ChewingArrayIndexLevel<len>;            \
527         return array->add_index(keys, token);                   \
528     }
529 
530     switch(phrase_length) {
531 	CASE(0);
532 	CASE(1);
533 	CASE(2);
534 	CASE(3);
535 	CASE(4);
536 	CASE(5);
537 	CASE(6);
538 	CASE(7);
539 	CASE(8);
540 	CASE(9);
541 	CASE(10);
542 	CASE(11);
543 	CASE(12);
544 	CASE(13);
545 	CASE(14);
546 	CASE(15);
547     default:
548 	assert(false);
549     }
550 
551 #undef CASE
552 }
553 
remove_index(int phrase_length,const ChewingKey keys[],phrase_token_t token)554 int ChewingLengthIndexLevel::remove_index(int phrase_length,
555                                           /* in */ const ChewingKey keys[],
556                                           /* in */ phrase_token_t token) {
557     if (!(phrase_length + 1 < MAX_PHRASE_LENGTH))
558         return ERROR_PHRASE_TOO_LONG;
559 
560     if ((int) m_chewing_array_indexes->len <= phrase_length)
561         return ERROR_REMOVE_ITEM_DONOT_EXISTS;
562 
563 #define CASE(len) case len:                                     \
564     {                                                           \
565         ChewingArrayIndexLevel<len> * & array = g_array_index   \
566             (m_chewing_array_indexes,                           \
567              ChewingArrayIndexLevel<len> *, len);               \
568         if (NULL == array)                                      \
569             return ERROR_REMOVE_ITEM_DONOT_EXISTS;              \
570         int retval = array->remove_index(keys, token);          \
571                                                                 \
572         /* remove empty array. */                               \
573         if (0 == array->get_length()) {                         \
574             delete array;                                       \
575             array = NULL;                                       \
576                                                                 \
577             /* shrink self array. */                            \
578             g_array_set_size(m_chewing_array_indexes,           \
579                              get_length());                     \
580         }                                                       \
581         return retval;                                          \
582     }
583 
584     switch (phrase_length) {
585 	CASE(0);
586 	CASE(1);
587 	CASE(2);
588 	CASE(3);
589 	CASE(4);
590 	CASE(5);
591 	CASE(6);
592 	CASE(7);
593 	CASE(8);
594 	CASE(9);
595 	CASE(10);
596 	CASE(11);
597 	CASE(12);
598 	CASE(13);
599 	CASE(14);
600 	CASE(15);
601     default:
602 	assert(false);
603     }
604 
605 #undef CASE
606 }
607 
608 template<size_t phrase_length>
add_index(const ChewingKey keys[],phrase_token_t token)609 int ChewingArrayIndexLevel<phrase_length>::add_index
610 (/* in */ const ChewingKey keys[], /* in */ phrase_token_t token) {
611     IndexItem * begin, * end;
612 
613     IndexItem add_elem(keys, token);
614     begin = (IndexItem *) m_chunk.begin();
615     end   = (IndexItem *) m_chunk.end();
616 
617     std_lite::pair<IndexItem *, IndexItem *> range;
618     range = std_lite::equal_range
619         (begin, end, add_elem, phrase_exact_less_than2<phrase_length>);
620 
621     IndexItem * cur_elem;
622     for (cur_elem = range.first;
623          cur_elem != range.second; ++cur_elem) {
624         if (cur_elem->m_token == token)
625             return ERROR_INSERT_ITEM_EXISTS;
626         if (cur_elem->m_token > token)
627             break;
628     }
629 
630     int offset = (cur_elem - begin) * sizeof(IndexItem);
631     m_chunk.insert_content(offset, &add_elem, sizeof(IndexItem));
632     return ERROR_OK;
633 }
634 
635 template<size_t phrase_length>
remove_index(const ChewingKey keys[],phrase_token_t token)636 int ChewingArrayIndexLevel<phrase_length>::remove_index
637 (/* in */ const ChewingKey keys[], /* in */ phrase_token_t token) {
638     IndexItem * begin, * end;
639 
640     IndexItem remove_elem(keys, token);
641     begin = (IndexItem *) m_chunk.begin();
642     end   = (IndexItem *) m_chunk.end();
643 
644     std_lite::pair<IndexItem *, IndexItem *> range;
645     range = std_lite::equal_range
646         (begin, end, remove_elem, phrase_exact_less_than2<phrase_length>);
647 
648     IndexItem * cur_elem;
649     for (cur_elem = range.first;
650          cur_elem != range.second; ++cur_elem) {
651         if (cur_elem->m_token == token)
652             break;
653     }
654 
655     if (cur_elem == range.second)
656         return ERROR_REMOVE_ITEM_DONOT_EXISTS;
657 
658     int offset = (cur_elem - begin) * sizeof(IndexItem);
659     m_chunk.remove_content(offset, sizeof(IndexItem));
660     return ERROR_OK;
661 }
662 
663 
664 /* load text method */
load_text(FILE * infile,TABLE_PHONETIC_TYPE type)665 bool ChewingLargeTable::load_text(FILE * infile, TABLE_PHONETIC_TYPE type) {
666     char pinyin[256];
667     char phrase[256];
668     phrase_token_t token;
669     size_t freq;
670 
671     while (!feof(infile)) {
672         int num = fscanf(infile, "%255s %255s %u %ld",
673                          pinyin, phrase, &token, &freq);
674 
675         if (4 != num)
676             continue;
677 
678         if(feof(infile))
679             break;
680 
681         glong len = g_utf8_strlen(phrase, -1);
682 
683         ChewingKeyVector keys;
684         ChewingKeyRestVector key_rests;
685 
686         keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey));
687         key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
688 
689         switch (type) {
690         case PINYIN_TABLE: {
691             PinyinDirectParser2 parser;
692             pinyin_option_t options = USE_TONE;
693             parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
694             break;
695         }
696 
697         case ZHUYIN_TABLE: {
698             ZhuyinDirectParser2 parser;
699             pinyin_option_t options = USE_TONE | FORCE_TONE;
700             parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
701             break;
702         }
703         };
704 
705         if (len != keys->len) {
706             fprintf(stderr, "ChewingLargeTable::load_text:%s\t%s\t%u\t%ld\n",
707                     pinyin, phrase, token, freq);
708             continue;
709         }
710 
711         add_index(keys->len, (ChewingKey *)keys->data, token);
712 
713         g_array_free(keys, TRUE);
714         g_array_free(key_rests, TRUE);
715     }
716 
717     return true;
718 }
719 
720 
721 /* load/store method */
722 
load(MemoryChunk * chunk,table_offset_t offset,table_offset_t end)723 bool ChewingBitmapIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
724                                    table_offset_t end) {
725     reset();
726     char * begin = (char *) chunk->begin();
727     table_offset_t phrase_begin, phrase_end;
728     table_offset_t * index = (table_offset_t *) (begin + offset);
729     phrase_end = *index;
730 
731     for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k)
732         for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
733             for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m)
734                 for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) {
735                     phrase_begin = phrase_end;
736                     index++;
737                     phrase_end = *index;
738 
739                     if (phrase_begin == phrase_end) /* null pointer */
740                         continue;
741 
742                     /* after reset() all phrases are null pointer. */
743                     ChewingLengthIndexLevel * phrases = new ChewingLengthIndexLevel;
744                     m_chewing_length_indexes[k][l][m][n] = phrases;
745 
746                     phrases->load(chunk, phrase_begin, phrase_end - 1);
747                     assert(phrase_end <= end);
748                     assert(*(begin + phrase_end - 1)  == c_separate);
749                 }
750 
751     offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t);
752     assert(c_separate == *(begin + offset));
753     return true;
754 }
755 
store(MemoryChunk * new_chunk,table_offset_t offset,table_offset_t & end)756 bool ChewingBitmapIndexLevel::store(MemoryChunk * new_chunk,
757                                     table_offset_t offset,
758                                     table_offset_t & end) {
759     table_offset_t phrase_end;
760     table_offset_t index = offset;
761     offset += (CHEWING_NUMBER_OF_INITIALS * CHEWING_NUMBER_OF_MIDDLES * CHEWING_NUMBER_OF_FINALS * CHEWING_NUMBER_OF_TONES + 1) * sizeof(table_offset_t);
762 
763     /* add '#' */
764     new_chunk->set_content(offset, &c_separate, sizeof(char));
765     offset += sizeof(char);
766     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
767     index += sizeof(table_offset_t);
768 
769     for (int k = 0; k < CHEWING_NUMBER_OF_INITIALS; ++k)
770         for (int l = 0; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
771             for (int m = 0; m < CHEWING_NUMBER_OF_FINALS; ++m)
772                 for (int n = 0; n < CHEWING_NUMBER_OF_TONES; ++n) {
773                     ChewingLengthIndexLevel * phrases =
774                         m_chewing_length_indexes[k][l][m][n];
775 
776                     if (NULL == phrases) { /* null pointer */
777                         new_chunk->set_content(index, &offset,
778                                                sizeof(table_offset_t));
779                         index += sizeof(table_offset_t);
780                         continue;
781                     }
782 
783                     /* has a end '#' */
784                     phrases->store(new_chunk, offset, phrase_end);
785                     offset = phrase_end;
786 
787                     /* add '#' */
788                     new_chunk->set_content(offset, &c_separate, sizeof(char));
789                     offset += sizeof(char);
790                     new_chunk->set_content(index, &offset,
791                                            sizeof(table_offset_t));
792                     index += sizeof(table_offset_t);
793                 }
794 
795     end = offset;
796     return true;
797 }
798 
load(MemoryChunk * chunk,table_offset_t offset,table_offset_t end)799 bool ChewingLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
800                                    table_offset_t end) {
801     char * begin = (char *) chunk->begin();
802     guint32 nindex = *((guint32 *)(begin + offset)); /* number of index */
803     table_offset_t * index = (table_offset_t *)
804         (begin + offset + sizeof(guint32));
805 
806     table_offset_t phrase_begin, phrase_end = *index;
807     g_array_set_size(m_chewing_array_indexes, 0);
808     for (guint32 i = 0; i < nindex; ++i) {
809         phrase_begin = phrase_end;
810         index++;
811         phrase_end = *index;
812 
813         if (phrase_begin == phrase_end) {
814             void * null = NULL;
815             g_array_append_val(m_chewing_array_indexes, null);
816             continue;
817         }
818 
819 #define CASE(len) case len:                                             \
820         {                                                               \
821             ChewingArrayIndexLevel<len> * phrase =                      \
822                 new ChewingArrayIndexLevel<len>;                        \
823             phrase->load(chunk, phrase_begin, phrase_end - 1);          \
824             assert(*(begin + phrase_end - 1) == c_separate);            \
825             assert(phrase_end <= end);                                  \
826             g_array_append_val(m_chewing_array_indexes, phrase);        \
827             break;                                                      \
828         }
829 
830 	switch ( i ){
831 	    CASE(0);
832 	    CASE(1);
833 	    CASE(2);
834 	    CASE(3);
835 	    CASE(4);
836 	    CASE(5);
837 	    CASE(6);
838 	    CASE(7);
839 	    CASE(8);
840 	    CASE(9);
841 	    CASE(10);
842 	    CASE(11);
843 	    CASE(12);
844 	    CASE(13);
845 	    CASE(14);
846 	    CASE(15);
847 	default:
848 	    assert(false);
849 	}
850 
851 #undef CASE
852     }
853 
854     /* check '#' */
855     offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
856     assert(c_separate == *(begin + offset));
857     return true;
858 }
859 
store(MemoryChunk * new_chunk,table_offset_t offset,table_offset_t & end)860 bool ChewingLengthIndexLevel::store(MemoryChunk * new_chunk,
861                                     table_offset_t offset,
862                                     table_offset_t & end) {
863     guint32 nindex = m_chewing_array_indexes->len; /* number of index */
864     new_chunk->set_content(offset, &nindex, sizeof(guint32));
865     table_offset_t index = offset + sizeof(guint32);
866 
867     offset += sizeof(guint32) + (nindex + 1) * sizeof(table_offset_t);
868     new_chunk->set_content(offset, &c_separate, sizeof(char));
869     offset += sizeof(char);
870     new_chunk->set_content(index, &offset, sizeof(table_offset_t));
871     index += sizeof(table_offset_t);
872 
873     table_offset_t phrase_end;
874     for (guint32 i = 0; i < nindex; ++i) {
875 #define CASE(len) case len:                                             \
876         {                                                               \
877             ChewingArrayIndexLevel<len> * phrase = g_array_index        \
878                 (m_chewing_array_indexes, ChewingArrayIndexLevel<len> *, len); \
879             if (NULL == phrase) {                                       \
880                 new_chunk->set_content                                  \
881                     (index, &offset, sizeof(table_offset_t));           \
882                 index += sizeof(table_offset_t);                        \
883                 continue;                                               \
884             }                                                           \
885             phrase->store(new_chunk, offset, phrase_end);               \
886             offset = phrase_end;                                        \
887             break;                                                      \
888         }
889 
890 	switch ( i ){
891 	    CASE(0);
892 	    CASE(1);
893 	    CASE(2);
894 	    CASE(3);
895 	    CASE(4);
896 	    CASE(5);
897 	    CASE(6);
898 	    CASE(7);
899 	    CASE(8);
900 	    CASE(9);
901 	    CASE(10);
902 	    CASE(11);
903 	    CASE(12);
904 	    CASE(13);
905 	    CASE(14);
906 	    CASE(15);
907 	default:
908 	    assert(false);
909 	}
910 #undef CASE
911 
912         /* add '#' */
913         new_chunk->set_content(offset, &c_separate, sizeof(char));
914         offset += sizeof(char);
915         new_chunk->set_content(index, &offset, sizeof(table_offset_t));
916         index += sizeof(table_offset_t);
917     }
918 
919     end = offset;
920     return true;
921 }
922 
923 template<size_t phrase_length>
924 bool ChewingArrayIndexLevel<phrase_length>::
load(MemoryChunk * chunk,table_offset_t offset,table_offset_t end)925 load(MemoryChunk * chunk, table_offset_t offset, table_offset_t end) {
926     char * begin = (char *) chunk->begin();
927     m_chunk.set_chunk(begin + offset, end - offset, NULL);
928     return true;
929 }
930 
931 template<size_t phrase_length>
932 bool ChewingArrayIndexLevel<phrase_length>::
store(MemoryChunk * new_chunk,table_offset_t offset,table_offset_t & end)933 store(MemoryChunk * new_chunk, table_offset_t offset, table_offset_t & end) {
934     new_chunk->set_content(offset, m_chunk.begin(), m_chunk.size());
935     end = offset + m_chunk.size();
936     return true;
937 }
938 
939 
940 /* get length method */
941 
get_length() const942 int ChewingLengthIndexLevel::get_length() const {
943     int length = m_chewing_array_indexes->len;
944 
945     /* trim trailing zero. */
946     for (int i = length - 1; i >= 0; --i) {
947         void * array = g_array_index(m_chewing_array_indexes, void *, i);
948 
949         if (NULL != array)
950             break;
951 
952         --length;
953     }
954 
955     return length;
956 }
957 
958 template<size_t phrase_length>
get_length() const959 int ChewingArrayIndexLevel<phrase_length>::get_length() const {
960     IndexItem * chunk_begin = NULL, * chunk_end = NULL;
961     chunk_begin = (IndexItem *) m_chunk.begin();
962     chunk_end = (IndexItem *) m_chunk.end();
963 
964     return chunk_end - chunk_begin;
965 }
966 
967 
968 /* mask out method */
969 
mask_out(phrase_token_t mask,phrase_token_t value)970 bool ChewingBitmapIndexLevel::mask_out(phrase_token_t mask,
971                                        phrase_token_t value) {
972     for (int k = CHEWING_ZERO_INITIAL; k < CHEWING_NUMBER_OF_INITIALS; ++k)
973         for (int l = CHEWING_ZERO_MIDDLE; l < CHEWING_NUMBER_OF_MIDDLES; ++l)
974             for (int m = CHEWING_ZERO_FINAL; m < CHEWING_NUMBER_OF_FINALS; ++m)
975                 for (int n = CHEWING_ZERO_TONE; n < CHEWING_NUMBER_OF_TONES;
976                      ++n) {
977                     ChewingLengthIndexLevel * & length_array =
978                         m_chewing_length_indexes[k][l][m][n];
979 
980                     if (NULL == length_array)
981                         continue;
982 
983                     length_array->mask_out(mask, value);
984 
985                     if (0 == length_array->get_length()) {
986                         delete length_array;
987                         length_array = NULL;
988                     }
989                 }
990     return true;
991 }
992 
mask_out(phrase_token_t mask,phrase_token_t value)993 bool ChewingLengthIndexLevel::mask_out(phrase_token_t mask,
994                                        phrase_token_t value) {
995 #define CASE(len) case len:                                     \
996     {                                                           \
997         ChewingArrayIndexLevel<len> * & array = g_array_index   \
998             (m_chewing_array_indexes,                           \
999              ChewingArrayIndexLevel<len> *, len);               \
1000                                                                 \
1001         if (NULL == array)                                      \
1002             continue;                                           \
1003                                                                 \
1004         array->mask_out(mask, value);                           \
1005                                                                 \
1006         if (0 == array->get_length()) {                         \
1007             delete array;                                       \
1008             array = NULL;                                       \
1009         }                                                       \
1010         break;                                                  \
1011     }
1012 
1013     for (guint i = 0; i < m_chewing_array_indexes->len; ++i) {
1014         switch (i){
1015 	    CASE(0);
1016 	    CASE(1);
1017 	    CASE(2);
1018 	    CASE(3);
1019 	    CASE(4);
1020 	    CASE(5);
1021 	    CASE(6);
1022 	    CASE(7);
1023 	    CASE(8);
1024 	    CASE(9);
1025 	    CASE(10);
1026 	    CASE(11);
1027 	    CASE(12);
1028 	    CASE(13);
1029 	    CASE(14);
1030 	    CASE(15);
1031 	default:
1032 	    assert(false);
1033         }
1034     }
1035 #undef CASE
1036     g_array_set_size(m_chewing_array_indexes, get_length());
1037     return true;
1038 }
1039 
1040 template<size_t phrase_length>
mask_out(phrase_token_t mask,phrase_token_t value)1041 bool ChewingArrayIndexLevel<phrase_length>::mask_out
1042 (phrase_token_t mask, phrase_token_t value) {
1043     IndexItem * begin = NULL, * end = NULL;
1044     begin = (IndexItem *) m_chunk.begin();
1045     end   = (IndexItem *) m_chunk.end();
1046 
1047     for (IndexItem * cur = begin; cur != end; ++cur) {
1048         if ((cur->m_token & mask) != value)
1049             continue;
1050 
1051         int offset = (cur - begin) * sizeof(IndexItem);
1052         m_chunk.remove_content(offset, sizeof(IndexItem));
1053 
1054         /* update chunk end. */
1055         end = (IndexItem *) m_chunk.end();
1056         --cur;
1057     }
1058 
1059     return true;
1060 }
1061