1 /*
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *
5  *  Copyright (C) 2017 Peng Wu <alexepico@gmail.com>
6  *
7  *  This program is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include "zhuyin.h"
22 #include <stdio.h>
23 #include <unistd.h>
24 #include <glib/gstdio.h>
25 #include "pinyin_internal.h"
26 
27 
28 using namespace pinyin;
29 
30 /* a glue layer for input method integration. */
31 
32 typedef GArray * CandidateVector; /* GArray of lookup_candidate_t */
33 
34 struct _zhuyin_context_t{
35     zhuyin_option_t m_options;
36 
37     /* input parsers. */
38     FullPinyinScheme m_full_pinyin_scheme;
39     FullPinyinParser2 * m_full_pinyin_parser;
40     ZhuyinParser2 * m_chewing_parser;
41 
42     /* default tables. */
43     FacadeChewingTable2 * m_pinyin_table;
44     FacadePhraseTable3 * m_phrase_table;
45     FacadePhraseIndex * m_phrase_index;
46     Bigram * m_system_bigram;
47     Bigram * m_user_bigram;
48 
49     /* lookups. */
50     PhoneticLookup<1, 1> * m_pinyin_lookup;
51     PhraseLookup * m_phrase_lookup;
52 
53     char * m_system_dir;
54     char * m_user_dir;
55     bool m_modified;
56 
57     SystemTableInfo2 m_system_table_info;
58 };
59 
60 struct _zhuyin_instance_t{
61     /* pointer of zhuyin_context_t. */
62     zhuyin_context_t * m_context;
63 
64     /* the tokens of phrases before the user input. */
65     TokenVector m_prefixes;
66 
67     /* cached parsed pinyin keys. */
68     PhoneticKeyMatrix m_matrix;
69     size_t m_parsed_len;
70 
71     /* cached pinyin lookup variables. */
72     ForwardPhoneticConstraints * m_constraints;
73     NBestMatchResults m_nbest_results;
74     TokenVector m_phrase_result;
75     CandidateVector m_candidates;
76 };
77 
78 struct _lookup_candidate_t{
79     lookup_candidate_type_t m_candidate_type;
80     gchar * m_phrase_string;
81     phrase_token_t m_token;
82     guint8 m_phrase_length;
83     guint16 m_begin; /* must contain the preceding "'" character. */
84     guint16 m_end; /* must not contain the following "'" character. */
85     guint32 m_freq; /* the amplifed gfloat numerical value. */
86 
87 public:
_lookup_candidate_t_lookup_candidate_t88     _lookup_candidate_t() {
89         m_candidate_type = NORMAL_CANDIDATE_AFTER_CURSOR;
90         m_phrase_string = NULL;
91         m_token = null_token;
92         m_phrase_length = 0;
93         m_begin = 0; m_end = 0;
94         m_freq = 0;
95     }
96 };
97 
98 struct _import_iterator_t{
99     zhuyin_context_t * m_context;
100     guint8 m_phrase_index;
101 };
102 
_clean_user_files(const char * user_dir,const pinyin_table_info_t * phrase_files)103 static bool _clean_user_files(const char * user_dir,
104                               const pinyin_table_info_t * phrase_files){
105     /* clean up files, if version mis-matches. */
106     for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
107         const pinyin_table_info_t * table_info = phrase_files + i;
108 
109         if (NOT_USED == table_info->m_file_type)
110             continue;
111 
112         if (NULL == table_info->m_user_filename)
113             continue;
114 
115         const char * userfilename = table_info->m_user_filename;
116 
117         /* remove dbin file. */
118         gchar * filename = g_build_filename(user_dir, userfilename, NULL);
119         unlink(filename);
120         g_free(filename);
121     }
122 
123     return true;
124 }
125 
check_format(zhuyin_context_t * context)126 static bool check_format(zhuyin_context_t * context){
127     const char * user_dir = context->m_user_dir;
128 
129     UserTableInfo user_table_info;
130     gchar * filename = g_build_filename
131         (user_dir, USER_TABLE_INFO, NULL);
132     user_table_info.load(filename);
133     g_free(filename);
134 
135     bool exists = user_table_info.is_conform
136         (&context->m_system_table_info);
137 
138     if (exists)
139         return exists;
140 
141     const pinyin_table_info_t * phrase_files = NULL;
142 
143     phrase_files = context->m_system_table_info.get_default_tables();
144     _clean_user_files(user_dir, phrase_files);
145 
146     filename = g_build_filename
147         (user_dir, USER_PINYIN_INDEX, NULL);
148     unlink(filename);
149     g_free(filename);
150 
151     filename = g_build_filename
152         (user_dir, USER_PHRASE_INDEX, NULL);
153     unlink(filename);
154     g_free(filename);
155 
156     filename = g_build_filename
157         (user_dir, USER_BIGRAM, NULL);
158     unlink(filename);
159     g_free(filename);
160 
161     return exists;
162 }
163 
mark_version(zhuyin_context_t * context)164 static bool mark_version(zhuyin_context_t * context){
165     const char * userdir = context->m_user_dir;
166 
167     UserTableInfo user_table_info;
168     user_table_info.make_conform(&context->m_system_table_info);
169 
170     gchar * filename = g_build_filename
171         (userdir, USER_TABLE_INFO, NULL);
172     bool retval = user_table_info.save(filename);
173     g_free(filename);
174 
175     return retval;
176 }
177 
_load_phrase_library(const char * system_dir,const char * user_dir,FacadePhraseIndex * phrase_index,const pinyin_table_info_t * table_info)178 static bool _load_phrase_library (const char * system_dir,
179                                   const char * user_dir,
180                                   FacadePhraseIndex * phrase_index,
181                                   const pinyin_table_info_t * table_info){
182     /* check whether the sub phrase index is already loaded. */
183     PhraseIndexRange range;
184     guint8 index = table_info->m_dict_index;
185 
186     int retval = phrase_index->get_range(index, range);
187     if (ERROR_OK == retval)
188         return false;
189 
190     if (SYSTEM_FILE == table_info->m_file_type) {
191         /* system phrase library */
192         MemoryChunk * chunk = new MemoryChunk;
193 
194         const char * systemfilename = table_info->m_system_filename;
195         /* check bin file in system dir. */
196         gchar * chunkfilename = g_build_filename(system_dir,
197                                                  systemfilename, NULL);
198 #ifdef LIBPINYIN_USE_MMAP
199         if (!chunk->mmap(chunkfilename))
200             fprintf(stderr, "mmap %s failed!\n", chunkfilename);
201 #else
202         if (!chunk->load(chunkfilename))
203             fprintf(stderr, "open %s failed!\n", chunkfilename);
204 #endif
205 
206         g_free(chunkfilename);
207 
208         phrase_index->load(index, chunk);
209 
210         const char * userfilename = table_info->m_user_filename;
211 
212         chunkfilename = g_build_filename(user_dir,
213                                          userfilename, NULL);
214 
215         MemoryChunk * log = new MemoryChunk;
216         log->load(chunkfilename);
217         g_free(chunkfilename);
218 
219         /* merge the chunk log. */
220         phrase_index->merge(index, log);
221         return true;
222     }
223 
224     if (DICTIONARY == table_info->m_file_type) {
225         /* addon dictionary. */
226         MemoryChunk * chunk = new MemoryChunk;
227 
228         const char * systemfilename = table_info->m_system_filename;
229         /* check bin file in system dir. */
230         gchar * chunkfilename = g_build_filename(system_dir,
231                                                  systemfilename, NULL);
232 #ifdef LIBPINYIN_USE_MMAP
233         if (!chunk->mmap(chunkfilename))
234             fprintf(stderr, "mmap %s failed!\n", chunkfilename);
235 #else
236         if (!chunk->load(chunkfilename))
237             fprintf(stderr, "open %s failed!\n", chunkfilename);
238 #endif
239 
240         g_free(chunkfilename);
241 
242         phrase_index->load(index, chunk);
243 
244         return true;
245     }
246 
247     if (USER_FILE == table_info->m_file_type) {
248         /* user phrase library */
249         MemoryChunk * chunk = new MemoryChunk;
250         const char * userfilename = table_info->m_user_filename;
251 
252         gchar * chunkfilename = g_build_filename(user_dir,
253                                                  userfilename, NULL);
254 
255         /* check bin file exists. if not, create a new one. */
256         if (chunk->load(chunkfilename)) {
257             phrase_index->load(index, chunk);
258         } else {
259             delete chunk;
260             phrase_index->create_sub_phrase(index);
261         }
262 
263         g_free(chunkfilename);
264         return true;
265     }
266 
267     return false;
268 }
269 
zhuyin_init(const char * systemdir,const char * userdir)270 zhuyin_context_t * zhuyin_init(const char * systemdir, const char * userdir){
271     zhuyin_context_t * context = new zhuyin_context_t;
272 
273     context->m_options = USE_TONE | FORCE_TONE;
274 
275     context->m_system_dir = g_strdup(systemdir);
276     context->m_user_dir = g_strdup(userdir);
277     context->m_modified = false;
278 
279     gchar * filename = g_build_filename
280         (context->m_system_dir, SYSTEM_TABLE_INFO, NULL);
281     if (!context->m_system_table_info.load(filename)) {
282         fprintf(stderr, "load %s failed!\n", filename);
283         return NULL;
284     }
285     g_free(filename);
286 
287 
288     check_format(context);
289 
290     context->m_full_pinyin_scheme = FULL_PINYIN_DEFAULT;
291     context->m_full_pinyin_parser = new FullPinyinParser2;
292     context->m_chewing_parser = new ZhuyinSimpleParser2;
293 
294     /* load chewing table. */
295     context->m_pinyin_table = new FacadeChewingTable2;
296 
297     gchar * system_filename = g_build_filename
298         (context->m_system_dir, SYSTEM_PINYIN_INDEX, NULL);
299     gchar * user_filename = g_build_filename
300         (context->m_user_dir, USER_PINYIN_INDEX, NULL);
301     context->m_pinyin_table->load(system_filename, user_filename);
302     g_free(user_filename);
303     g_free(system_filename);
304 
305     /* load phrase table */
306     context->m_phrase_table = new FacadePhraseTable3;
307 
308     system_filename = g_build_filename
309         (context->m_system_dir, SYSTEM_PHRASE_INDEX, NULL);
310     user_filename = g_build_filename
311         (context->m_user_dir, USER_PHRASE_INDEX, NULL);
312     context->m_phrase_table->load(system_filename, user_filename);
313     g_free(user_filename);
314     g_free(system_filename);
315 
316     context->m_phrase_index = new FacadePhraseIndex;
317 
318     /* load all default tables. */
319     for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
320         const pinyin_table_info_t * phrase_files =
321             context->m_system_table_info.get_default_tables();
322 
323         const pinyin_table_info_t * table_info =
324             phrase_files + i;
325 
326         if (NOT_USED == table_info->m_file_type)
327             continue;
328 
329         /* addon dictionary should not in default tables. */
330         assert(DICTIONARY != table_info->m_file_type);
331 
332         _load_phrase_library(context->m_system_dir, context->m_user_dir,
333                              context->m_phrase_index, table_info);
334     }
335 
336     context->m_system_bigram = new Bigram;
337     filename = g_build_filename(context->m_system_dir, SYSTEM_BIGRAM, NULL);
338     context->m_system_bigram->attach(filename, ATTACH_READONLY);
339     g_free(filename);
340 
341     context->m_user_bigram = new Bigram;
342     filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL);
343     context->m_user_bigram->load_db(filename);
344     g_free(filename);
345 
346     gfloat lambda = context->m_system_table_info.get_lambda();
347 
348     context->m_pinyin_lookup = new PhoneticLookup<1, 1>
349         ( lambda,
350           context->m_pinyin_table, context->m_phrase_index,
351           context->m_system_bigram, context->m_user_bigram);
352 
353     context->m_phrase_lookup = new PhraseLookup
354         (lambda,
355          context->m_phrase_table, context->m_phrase_index,
356          context->m_system_bigram, context->m_user_bigram);
357 
358     return context;
359 }
360 
zhuyin_load_phrase_library(zhuyin_context_t * context,guint8 index)361 bool zhuyin_load_phrase_library(zhuyin_context_t * context,
362                                 guint8 index){
363     if (!(index < PHRASE_INDEX_LIBRARY_COUNT))
364         return false;
365 
366     const pinyin_table_info_t * phrase_files =
367         context->m_system_table_info.get_default_tables();
368     FacadePhraseIndex * phrase_index = context->m_phrase_index;
369     const pinyin_table_info_t * table_info = phrase_files + index;
370 
371     /* Only SYSTEM_FILE or USER_FILE is allowed here. */
372     assert(SYSTEM_FILE == table_info->m_file_type
373            || USER_FILE == table_info->m_file_type);
374 
375     return _load_phrase_library(context->m_system_dir, context->m_user_dir,
376                                 phrase_index, table_info);
377 }
378 
zhuyin_unload_phrase_library(zhuyin_context_t * context,guint8 index)379 bool zhuyin_unload_phrase_library(zhuyin_context_t * context,
380                                   guint8 index){
381     assert(index < PHRASE_INDEX_LIBRARY_COUNT);
382 
383     /* default table. */
384     /* tsi.bin can't be unloaded. */
385     if (TSI_DICTIONARY == index)
386         return false;
387 
388     context->m_phrase_index->unload(index);
389     return true;
390 }
391 
zhuyin_begin_add_phrases(zhuyin_context_t * context,guint8 index)392 import_iterator_t * zhuyin_begin_add_phrases(zhuyin_context_t * context,
393                                              guint8 index){
394     import_iterator_t * iter = new import_iterator_t;
395     iter->m_context = context;
396     iter->m_phrase_index = index;
397     return iter;
398 }
399 
_add_phrase(zhuyin_context_t * context,guint8 index,ChewingKeyVector keys,ucs4_t * phrase,glong phrase_length,gint count)400 static bool _add_phrase(zhuyin_context_t * context,
401                         guint8 index,
402                         ChewingKeyVector keys,
403                         ucs4_t * phrase,
404                         glong phrase_length,
405                         gint count) {
406     /* if -1 == count, use the default value. */
407     const gint default_count = 5;
408     const guint32 unigram_factor = 3;
409     if (-1 == count)
410         count = default_count;
411 
412     FacadePhraseTable3 *  phrase_table = context->m_phrase_table;
413     FacadeChewingTable2 * pinyin_table = context->m_pinyin_table;
414     FacadePhraseIndex * phrase_index = context->m_phrase_index;
415 
416     bool result = false;
417 
418     /* check whether the phrase exists in phrase table */
419     phrase_token_t token = null_token;
420     GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
421 
422     /* do phrase table search. */
423     PhraseTokens tokens;
424     memset(tokens, 0, sizeof(PhraseTokens));
425     phrase_index->prepare_tokens(tokens);
426     int retval = phrase_table->search(phrase_length, phrase, tokens);
427     int num = reduce_tokens(tokens, tokenarray);
428     phrase_index->destroy_tokens(tokens);
429 
430     /* find the best token candidate. */
431     for (size_t i = 0; i < tokenarray->len; ++i) {
432         phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i);
433         if (null_token == token) {
434             token = candidate;
435             continue;
436         }
437 
438         if (PHRASE_INDEX_LIBRARY_INDEX(candidate) == index) {
439             /* only one phrase string per sub phrase index. */
440             assert(PHRASE_INDEX_LIBRARY_INDEX(token) != index);
441             token = candidate;
442             continue;
443         }
444     }
445     g_array_free(tokenarray, TRUE);
446 
447     PhraseItem item;
448     /* check whether it exists in the same sub phrase index; */
449     if (null_token != token &&
450         PHRASE_INDEX_LIBRARY_INDEX(token) == index) {
451         /* if so, remove the phrase, add the pinyin for the phrase item,
452            then add it back;*/
453         phrase_index->get_phrase_item(token, item);
454         assert(phrase_length == item.get_phrase_length());
455         ucs4_t tmp_phrase[MAX_PHRASE_LENGTH];
456         item.get_phrase_string(tmp_phrase);
457         assert(0 == memcmp
458                (phrase, tmp_phrase, sizeof(ucs4_t) * phrase_length));
459 
460         PhraseItem * removed_item = NULL;
461         retval = phrase_index->remove_phrase_item(token, removed_item);
462         if (ERROR_OK == retval) {
463             /* maybe check whether there are duplicated pronunciations here. */
464             removed_item->add_pronunciation((ChewingKey *)keys->data,
465                                             count);
466             phrase_index->add_phrase_item(token, removed_item);
467             delete removed_item;
468             result = true;
469         }
470     } else {
471         /* if not exists in the same sub phrase index,
472            get the maximum token,
473            then add it directly with maximum token + 1; */
474         PhraseIndexRange range;
475         retval = phrase_index->get_range(index, range);
476 
477         if (ERROR_OK == retval) {
478             token = range.m_range_end;
479             if (0x00000000 == (token & PHRASE_MASK))
480                 token++;
481 
482             if (phrase_length == keys->len) { /* valid pinyin */
483                 phrase_table->add_index(phrase_length, phrase, token);
484                 pinyin_table->add_index
485                     (keys->len, (ChewingKey *)(keys->data), token);
486 
487                 item.set_phrase_string(phrase_length, phrase);
488                 item.add_pronunciation((ChewingKey *)(keys->data), count);
489                 phrase_index->add_phrase_item(token, &item);
490                 phrase_index->add_unigram_frequency(token,
491                                                     count * unigram_factor);
492                 result = true;
493             }
494         }
495     }
496 
497     return result;
498 }
499 
zhuyin_iterator_add_phrase(import_iterator_t * iter,const char * phrase,const char * pinyin,gint count)500 bool zhuyin_iterator_add_phrase(import_iterator_t * iter,
501                                 const char * phrase,
502                                 const char * pinyin,
503                                 gint count){
504     zhuyin_context_t * context = iter->m_context;
505     guint8 index = iter->m_phrase_index;
506 
507     bool result = false;
508 
509     if (NULL == phrase || NULL == pinyin)
510         return result;
511 
512     glong phrase_length = 0;
513     ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &phrase_length, NULL);
514 
515     zhuyin_option_t options = USE_TONE | FORCE_TONE;
516     ZhuyinDirectParser2 parser;
517     ChewingKeyVector keys =
518         g_array_new(FALSE, FALSE, sizeof(ChewingKey));
519     ChewingKeyRestVector key_rests =
520         g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
521 
522     /* parse the pinyin. */
523     parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
524 
525     if (phrase_length != keys->len)
526         return result;
527 
528     if (0 == phrase_length || phrase_length >= MAX_PHRASE_LENGTH)
529         return result;
530 
531     result = _add_phrase(context, index, keys,
532                          ucs4_phrase, phrase_length, count);
533 
534     g_array_free(key_rests, TRUE);
535     g_array_free(keys, TRUE);
536     g_free(ucs4_phrase);
537     return result;
538 }
539 
zhuyin_end_add_phrases(import_iterator_t * iter)540 void zhuyin_end_add_phrases(import_iterator_t * iter){
541     /* compact the content memory chunk of phrase index. */
542     iter->m_context->m_phrase_index->compact();
543     iter->m_context->m_modified = true;
544     delete iter;
545 }
546 
zhuyin_save(zhuyin_context_t * context)547 bool zhuyin_save(zhuyin_context_t * context){
548     if (!context->m_user_dir)
549         return false;
550 
551     if (!context->m_modified)
552         return false;
553 
554     context->m_phrase_index->compact();
555 
556     const pinyin_table_info_t * phrase_files =
557         context->m_system_table_info.get_default_tables();
558 
559     /* skip the reserved zero phrase library. */
560     for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
561         PhraseIndexRange range;
562         int retval = context->m_phrase_index->get_range(i, range);
563 
564         if (ERROR_NO_SUB_PHRASE_INDEX == retval)
565             continue;
566 
567         const pinyin_table_info_t * table_info = phrase_files + i;
568 
569         if (NOT_USED == table_info->m_file_type)
570             continue;
571 
572         const char * userfilename = table_info->m_user_filename;
573 
574         if (NULL == userfilename)
575             continue;
576 
577         if (SYSTEM_FILE == table_info->m_file_type ||
578             DICTIONARY == table_info->m_file_type) {
579             /* system phrase library */
580             MemoryChunk * chunk = new MemoryChunk;
581             MemoryChunk * log = new MemoryChunk;
582             const char * systemfilename = table_info->m_system_filename;
583 
584             /* check bin file in system dir. */
585             gchar * chunkfilename = g_build_filename(context->m_system_dir,
586                                                      systemfilename, NULL);
587 #ifdef LIBPINYIN_USE_MMAP
588             if (!chunk->mmap(chunkfilename))
589                 fprintf(stderr, "mmap %s failed!\n", chunkfilename);
590 #else
591             if (!chunk->load(chunkfilename))
592                 fprintf(stderr, "open %s failed!\n", chunkfilename);
593 #endif
594 
595             g_free(chunkfilename);
596             context->m_phrase_index->diff(i, chunk, log);
597 
598             const char * userfilename = table_info->m_user_filename;
599             gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
600 
601             gchar * tmppathname = g_build_filename(context->m_user_dir,
602                                                    tmpfilename, NULL);
603             g_free(tmpfilename);
604 
605             gchar * chunkpathname = g_build_filename(context->m_user_dir,
606                                                      userfilename, NULL);
607             log->save(tmppathname);
608 
609             int result = rename(tmppathname, chunkpathname);
610             if (0 != result)
611                 fprintf(stderr, "rename %s to %s failed.\n",
612                         tmppathname, chunkpathname);
613 
614             g_free(chunkpathname);
615             g_free(tmppathname);
616             delete log;
617         }
618 
619         if (USER_FILE == table_info->m_file_type) {
620             /* user phrase library */
621             MemoryChunk * chunk = new MemoryChunk;
622             context->m_phrase_index->store(i, chunk);
623 
624             const char * userfilename = table_info->m_user_filename;
625             gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
626             gchar * tmppathname = g_build_filename(context->m_user_dir,
627                                                    tmpfilename, NULL);
628             g_free(tmpfilename);
629 
630             gchar * chunkpathname = g_build_filename(context->m_user_dir,
631                                                      userfilename, NULL);
632 
633             chunk->save(tmppathname);
634 
635             int result = rename(tmppathname, chunkpathname);
636             if (0 != result)
637                 fprintf(stderr, "rename %s to %s failed.\n",
638                         tmppathname, chunkpathname);
639 
640             g_free(chunkpathname);
641             g_free(tmppathname);
642             delete chunk;
643         }
644     }
645 
646     /* save user pinyin table */
647     gchar * tmpfilename = g_build_filename
648         (context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL);
649     unlink(tmpfilename);
650     gchar * filename = g_build_filename
651         (context->m_user_dir, USER_PINYIN_INDEX, NULL);
652 
653     context->m_pinyin_table->store(tmpfilename);
654 
655     int result = rename(tmpfilename, filename);
656     if (0 != result)
657         fprintf(stderr, "rename %s to %s failed.\n",
658                 tmpfilename, filename);
659 
660     g_free(tmpfilename);
661     g_free(filename);
662 
663     /* save user phrase table */
664     tmpfilename = g_build_filename
665         (context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL);
666     unlink(tmpfilename);
667     filename = g_build_filename
668         (context->m_user_dir, USER_PHRASE_INDEX, NULL);
669 
670     context->m_phrase_table->store(tmpfilename);
671 
672     result = rename(tmpfilename, filename);
673     if (0 != result)
674         fprintf(stderr, "rename %s to %s failed.\n",
675                 tmpfilename, filename);
676 
677     g_free(tmpfilename);
678     g_free(filename);
679 
680     /* save user bi-gram */
681     tmpfilename = g_build_filename
682         (context->m_user_dir, USER_BIGRAM ".tmp", NULL);
683     unlink(tmpfilename);
684     filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL);
685     context->m_user_bigram->save_db(tmpfilename);
686 
687     result = rename(tmpfilename, filename);
688     if (0 != result)
689         fprintf(stderr, "rename %s to %s failed.\n",
690                 tmpfilename, filename);
691 
692     g_free(tmpfilename);
693     g_free(filename);
694 
695     mark_version(context);
696 
697     context->m_modified = false;
698     return true;
699 }
700 
zhuyin_set_full_pinyin_scheme(zhuyin_context_t * context,FullPinyinScheme scheme)701 bool zhuyin_set_full_pinyin_scheme(zhuyin_context_t * context,
702                                    FullPinyinScheme scheme){
703     context->m_full_pinyin_scheme = scheme;
704     context->m_full_pinyin_parser->set_scheme(scheme);
705     return true;
706 }
707 
zhuyin_set_chewing_scheme(zhuyin_context_t * context,ZhuyinScheme scheme)708 bool zhuyin_set_chewing_scheme(zhuyin_context_t * context,
709                                ZhuyinScheme scheme){
710     delete context->m_chewing_parser;
711     context->m_chewing_parser = NULL;
712 
713     switch(scheme) {
714     case ZHUYIN_STANDARD:
715     case ZHUYIN_IBM:
716     case ZHUYIN_GINYIEH:
717     case ZHUYIN_ETEN:
718     case ZHUYIN_STANDARD_DVORAK: {
719         ZhuyinSimpleParser2 * parser = new ZhuyinSimpleParser2();
720         parser->set_scheme(scheme);
721         context->m_chewing_parser = parser;
722         break;
723     }
724     case ZHUYIN_HSU:
725     case ZHUYIN_ETEN26:
726     case ZHUYIN_HSU_DVORAK: {
727         ZhuyinDiscreteParser2 * parser = new ZhuyinDiscreteParser2();
728         parser->set_scheme(scheme);
729         context->m_chewing_parser = parser;
730         break;
731     }
732     case ZHUYIN_DACHEN_CP26:
733         context->m_chewing_parser = new ZhuyinDaChenCP26Parser2();
734         break;
735     default:
736         assert(FALSE);
737     }
738     return true;
739 }
740 
zhuyin_fini(zhuyin_context_t * context)741 void zhuyin_fini(zhuyin_context_t * context){
742     delete context->m_full_pinyin_parser;
743     delete context->m_chewing_parser;
744     delete context->m_pinyin_table;
745     delete context->m_phrase_table;
746     delete context->m_phrase_index;
747     delete context->m_system_bigram;
748     delete context->m_user_bigram;
749     delete context->m_pinyin_lookup;
750     delete context->m_phrase_lookup;
751 
752     g_free(context->m_system_dir);
753     g_free(context->m_user_dir);
754     context->m_modified = false;
755 
756     delete context;
757 }
758 
zhuyin_mask_out(zhuyin_context_t * context,phrase_token_t mask,phrase_token_t value)759 bool zhuyin_mask_out(zhuyin_context_t * context,
760                      phrase_token_t mask,
761                      phrase_token_t value) {
762 
763     context->m_pinyin_table->mask_out(mask, value);
764     context->m_phrase_table->mask_out(mask, value);
765     context->m_user_bigram->mask_out(mask, value);
766 
767     const pinyin_table_info_t * phrase_files =
768         context->m_system_table_info.get_default_tables();
769 
770     /* mask out the phrase index. */
771     for (size_t index = 1; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
772         PhraseIndexRange range;
773         int retval = context->m_phrase_index->get_range(index, range);
774 
775         if (ERROR_NO_SUB_PHRASE_INDEX == retval)
776             continue;
777 
778         const pinyin_table_info_t * table_info = phrase_files + index;
779 
780         if (NOT_USED == table_info->m_file_type)
781             continue;
782 
783         const char * userfilename = table_info->m_user_filename;
784 
785         if (NULL == userfilename)
786             continue;
787 
788         if (SYSTEM_FILE == table_info->m_file_type ||
789             DICTIONARY == table_info->m_file_type) {
790             /* system phrase library */
791             MemoryChunk * chunk = new MemoryChunk;
792 
793             const char * systemfilename = table_info->m_system_filename;
794             /* check bin file in system dir. */
795             gchar * chunkfilename = g_build_filename(context->m_system_dir,
796                                                      systemfilename, NULL);
797 
798 #ifdef LIBPINYIN_USE_MMAP
799             if (!chunk->mmap(chunkfilename))
800                 fprintf(stderr, "mmap %s failed!\n", chunkfilename);
801 #else
802             if (!chunk->load(chunkfilename))
803                 fprintf(stderr, "open %s failed!\n", chunkfilename);
804 #endif
805 
806             g_free(chunkfilename);
807 
808             context->m_phrase_index->load(index, chunk);
809 
810             const char * userfilename = table_info->m_user_filename;
811 
812             chunkfilename = g_build_filename(context->m_user_dir,
813                                              userfilename, NULL);
814 
815             MemoryChunk * log = new MemoryChunk;
816             log->load(chunkfilename);
817             g_free(chunkfilename);
818 
819             /* merge the chunk log with mask. */
820             context->m_phrase_index->merge_with_mask(index, log, mask, value);
821         }
822 
823         if (USER_FILE == table_info->m_file_type) {
824             /* user phrase library */
825             context->m_phrase_index->mask_out(index, mask, value);
826         }
827     }
828 
829     context->m_phrase_index->compact();
830     return true;
831 }
832 
833 /* copy from options to context->m_options. */
zhuyin_set_options(zhuyin_context_t * context,zhuyin_option_t options)834 bool zhuyin_set_options(zhuyin_context_t * context,
835                         zhuyin_option_t options){
836     context->m_options = options;
837 #if 0
838     context->m_pinyin_table->set_options(context->m_options);
839     context->m_pinyin_lookup->set_options(context->m_options);
840 #endif
841     return true;
842 }
843 
844 
zhuyin_alloc_instance(zhuyin_context_t * context)845 zhuyin_instance_t * zhuyin_alloc_instance(zhuyin_context_t * context){
846     zhuyin_instance_t * instance = new zhuyin_instance_t;
847     instance->m_context = context;
848 
849     instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
850 
851     instance->m_parsed_len = 0;
852 
853     instance->m_constraints = new ForwardPhoneticConstraints
854         (context->m_phrase_index);
855 
856     instance->m_phrase_result = g_array_new
857         (TRUE, TRUE, sizeof(phrase_token_t));
858     instance->m_candidates =
859         g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
860 
861     return instance;
862 }
863 
_free_candidates(CandidateVector candidates)864 static bool _free_candidates(CandidateVector candidates) {
865     /* free candidates */
866     for (size_t i = 0; i < candidates->len; ++i) {
867         lookup_candidate_t * candidate = &g_array_index
868             (candidates, lookup_candidate_t, i);
869         g_free(candidate->m_phrase_string);
870     }
871     g_array_set_size(candidates, 0);
872 
873     return true;
874 }
875 
zhuyin_free_instance(zhuyin_instance_t * instance)876 void zhuyin_free_instance(zhuyin_instance_t * instance){
877     g_array_free(instance->m_prefixes, TRUE);
878     delete instance->m_constraints;
879     g_array_free(instance->m_phrase_result, TRUE);
880     _free_candidates(instance->m_candidates);
881     g_array_free(instance->m_candidates, TRUE);
882 
883     delete instance;
884 }
885 
zhuyin_update_constraints(zhuyin_instance_t * instance)886 static bool zhuyin_update_constraints(zhuyin_instance_t * instance){
887     PhoneticKeyMatrix & matrix = instance->m_matrix;
888     ForwardPhoneticConstraints * constraints = instance->m_constraints;
889 
890     constraints->validate_constraint(&matrix);
891 
892     return true;
893 }
894 
zhuyin_guess_sentence(zhuyin_instance_t * instance)895 bool zhuyin_guess_sentence(zhuyin_instance_t * instance){
896     zhuyin_context_t * & context = instance->m_context;
897     PhoneticKeyMatrix & matrix = instance->m_matrix;
898 
899     g_array_set_size(instance->m_prefixes, 0);
900     g_array_append_val(instance->m_prefixes, sentence_start);
901 
902     zhuyin_update_constraints(instance);
903     bool retval = context->m_pinyin_lookup->get_nbest_match
904         (instance->m_prefixes,
905          &matrix,
906          instance->m_constraints,
907          &instance->m_nbest_results);
908 
909     return retval;
910 }
911 
_compute_prefixes(zhuyin_instance_t * instance,const char * prefix)912 static void _compute_prefixes(zhuyin_instance_t * instance,
913                               const char * prefix){
914     zhuyin_context_t * & context = instance->m_context;
915     FacadePhraseIndex * & phrase_index = context->m_phrase_index;
916 
917     glong len_str = 0;
918     ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL);
919     GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
920 
921     if (ucs4_str && len_str) {
922         /* add prefixes. */
923         for (ssize_t i = 1; i <= len_str; ++i) {
924             if (i > MAX_PHRASE_LENGTH)
925                 break;
926 
927             ucs4_t * start = ucs4_str + len_str - i;
928 
929             PhraseTokens tokens;
930             memset(tokens, 0, sizeof(tokens));
931             phrase_index->prepare_tokens(tokens);
932             int result = context->m_phrase_table->search(i, start, tokens);
933             int num = reduce_tokens(tokens, tokenarray);
934             phrase_index->destroy_tokens(tokens);
935 
936             if (result & SEARCH_OK)
937                 g_array_append_vals(instance->m_prefixes,
938                                     tokenarray->data, tokenarray->len);
939         }
940     }
941     g_array_free(tokenarray, TRUE);
942     g_free(ucs4_str);
943 }
944 
zhuyin_guess_sentence_with_prefix(zhuyin_instance_t * instance,const char * prefix)945 bool zhuyin_guess_sentence_with_prefix(zhuyin_instance_t * instance,
946                                        const char * prefix){
947     zhuyin_context_t * & context = instance->m_context;
948     PhoneticKeyMatrix & matrix = instance->m_matrix;
949 
950     g_array_set_size(instance->m_prefixes, 0);
951     g_array_append_val(instance->m_prefixes, sentence_start);
952 
953     _compute_prefixes(instance, prefix);
954 
955     zhuyin_update_constraints(instance);
956     bool retval = context->m_pinyin_lookup->get_nbest_match
957         (instance->m_prefixes,
958          &matrix,
959          instance->m_constraints,
960          &instance->m_nbest_results);
961 
962     return retval;
963 }
964 
zhuyin_phrase_segment(zhuyin_instance_t * instance,const char * sentence)965 bool zhuyin_phrase_segment(zhuyin_instance_t * instance,
966                            const char * sentence){
967     zhuyin_context_t * & context = instance->m_context;
968 
969     const glong num_of_chars = g_utf8_strlen(sentence, -1);
970     glong ucs4_len = 0;
971     ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL);
972 
973     g_return_val_if_fail(num_of_chars == ucs4_len, FALSE);
974 
975     bool retval = context->m_phrase_lookup->get_best_match
976         (ucs4_len, ucs4_str, instance->m_phrase_result);
977 
978     g_free(ucs4_str);
979     return retval;
980 }
981 
982 /* the returned sentence should be freed by g_free(). */
zhuyin_get_sentence(zhuyin_instance_t * instance,char ** sentence)983 bool zhuyin_get_sentence(zhuyin_instance_t * instance,
984                          char ** sentence){
985     zhuyin_context_t * & context = instance->m_context;
986     NBestMatchResults & results = instance->m_nbest_results;
987 
988     if (0 == results.size())
989         return false;
990 
991     MatchResult result = NULL;
992     assert(results.get_result(0, result));
993 
994     bool retval = pinyin::convert_to_utf8
995         (context->m_phrase_index, result,
996          NULL, false, *sentence);
997 
998     return retval;
999 }
1000 
zhuyin_parse_full_pinyin(zhuyin_instance_t * instance,const char * onepinyin,ChewingKey * onekey)1001 bool zhuyin_parse_full_pinyin(zhuyin_instance_t * instance,
1002                               const char * onepinyin,
1003                               ChewingKey * onekey){
1004     zhuyin_context_t * & context = instance->m_context;
1005     zhuyin_option_t options = context->m_options;
1006 
1007     /* disable the pinyin correction options. */
1008     options &= ~PINYIN_CORRECT_ALL;
1009 
1010     int pinyin_len = strlen(onepinyin);
1011     bool retval = context->m_full_pinyin_parser->parse_one_key
1012         (options, *onekey, onepinyin, pinyin_len);
1013     return retval;
1014 }
1015 
zhuyin_parse_more_full_pinyins(zhuyin_instance_t * instance,const char * pinyins)1016 size_t zhuyin_parse_more_full_pinyins(zhuyin_instance_t * instance,
1017                                       const char * pinyins){
1018     zhuyin_context_t * & context = instance->m_context;
1019     zhuyin_option_t options = context->m_options;
1020     PhoneticKeyMatrix & matrix = instance->m_matrix;
1021 
1022     /* disable the pinyin correction options. */
1023     options &= ~PINYIN_CORRECT_ALL;
1024 
1025     ChewingKeyVector keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey));
1026     ChewingKeyRestVector key_rests =
1027         g_array_new(TRUE, TRUE, sizeof(ChewingKeyRest));
1028 
1029     int parsed_len = context->m_full_pinyin_parser->parse
1030         (options, keys,
1031          key_rests, pinyins, strlen(pinyins));
1032 
1033     instance->m_parsed_len = parsed_len;
1034 
1035     fill_matrix(&matrix, keys, key_rests, parsed_len);
1036 
1037     fuzzy_syllable_step(options, &matrix);
1038 
1039     g_array_free(key_rests, TRUE);
1040     g_array_free(keys, TRUE);
1041     return parsed_len;
1042 }
1043 
zhuyin_parse_chewing(zhuyin_instance_t * instance,const char * onechewing,ChewingKey * onekey)1044 bool zhuyin_parse_chewing(zhuyin_instance_t * instance,
1045                           const char * onechewing,
1046                           ChewingKey * onekey){
1047     zhuyin_context_t * & context = instance->m_context;
1048     zhuyin_option_t options = context->m_options;
1049 
1050     int chewing_len = strlen(onechewing);
1051     bool retval = context->m_chewing_parser->parse_one_key
1052         (options, *onekey, onechewing, chewing_len );
1053     return retval;
1054 }
1055 
zhuyin_parse_more_chewings(zhuyin_instance_t * instance,const char * chewings)1056 size_t zhuyin_parse_more_chewings(zhuyin_instance_t * instance,
1057                                   const char * chewings){
1058     zhuyin_context_t * & context = instance->m_context;
1059     zhuyin_option_t options = context->m_options;
1060     PhoneticKeyMatrix & matrix = instance->m_matrix;
1061 
1062     ChewingKeyVector keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey));
1063     ChewingKeyRestVector key_rests =
1064         g_array_new(TRUE, TRUE, sizeof(ChewingKeyRest));
1065 
1066     int parsed_len = context->m_chewing_parser->parse
1067         (options, keys,
1068          key_rests, chewings, strlen(chewings));
1069 
1070     instance->m_parsed_len = parsed_len;
1071 
1072     fill_matrix(&matrix, keys, key_rests, parsed_len);
1073 
1074     fuzzy_syllable_step(options, &matrix);
1075 
1076     g_array_free(key_rests, TRUE);
1077     g_array_free(keys, TRUE);
1078     return parsed_len;
1079 }
1080 
zhuyin_get_parsed_input_length(zhuyin_instance_t * instance)1081 size_t zhuyin_get_parsed_input_length(zhuyin_instance_t * instance) {
1082     return instance->m_parsed_len;
1083 }
1084 
zhuyin_in_chewing_keyboard(zhuyin_instance_t * instance,const char key,gchar *** symbols)1085 bool zhuyin_in_chewing_keyboard(zhuyin_instance_t * instance,
1086                                 const char key, gchar *** symbols) {
1087     zhuyin_context_t * & context = instance->m_context;
1088     zhuyin_option_t options = context->m_options;
1089 
1090     return context->m_chewing_parser->in_chewing_scheme
1091         (options, key, *symbols);
1092 }
1093 
_token_get_phrase(FacadePhraseIndex * phrase_index,phrase_token_t token,guint * len,gchar ** utf8_str)1094 static bool _token_get_phrase(FacadePhraseIndex * phrase_index,
1095                               phrase_token_t token,
1096                               guint * len,
1097                               gchar ** utf8_str) {
1098     PhraseItem item;
1099     ucs4_t buffer[MAX_PHRASE_LENGTH];
1100 
1101     int retval = phrase_index->get_phrase_item(token, item);
1102     if (ERROR_OK != retval)
1103         return false;
1104 
1105     item.get_phrase_string(buffer);
1106     guint length = item.get_phrase_length();
1107     if (len)
1108         *len = length;
1109     if (utf8_str)
1110         *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
1111     return true;
1112 }
1113 
#if 0
/* disabled by the preprocessor: compare two lookup candidates
   by their m_token values. */
static gint compare_item_with_token(gconstpointer lhs,
                                    gconstpointer rhs) {
    lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
    lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;

    phrase_token_t token_lhs = item_lhs->m_token;
    phrase_token_t token_rhs = item_rhs->m_token;

    return (token_lhs - token_rhs);
}
#endif
1126 
compare_item_with_length_and_frequency(gconstpointer lhs,gconstpointer rhs)1127 static gint compare_item_with_length_and_frequency(gconstpointer lhs,
1128                                                    gconstpointer rhs) {
1129     lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
1130     lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
1131 
1132     guint8 len_lhs = item_lhs->m_phrase_length;
1133     guint8 len_rhs = item_rhs->m_phrase_length;
1134 
1135     if (len_lhs != len_rhs)
1136         return -(len_lhs - len_rhs); /* in descendant order */
1137 
1138     guint32 freq_lhs = item_lhs->m_freq;
1139     guint32 freq_rhs = item_rhs->m_freq;
1140 
1141     return -(freq_lhs - freq_rhs); /* in descendant order */
1142 }
1143 
_get_previous_token(zhuyin_instance_t * instance,size_t offset)1144 static phrase_token_t _get_previous_token(zhuyin_instance_t * instance,
1145                                           size_t offset) {
1146     zhuyin_context_t * context = instance->m_context;
1147     TokenVector prefixes = instance->m_prefixes;
1148     NBestMatchResults & results = instance->m_nbest_results;
1149 
1150     phrase_token_t prev_token = null_token;
1151     ssize_t i;
1152 
1153     if (0 == offset) {
1154         /* get previous token from prefixes. */
1155         prev_token = sentence_start;
1156         size_t prev_token_len = 0;
1157 
1158         PhraseItem item;
1159         for (size_t i = 0; i < prefixes->len; ++i) {
1160             phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
1161             if (sentence_start == token)
1162                 continue;
1163 
1164             int retval = context->m_phrase_index->get_phrase_item(token, item);
1165             if (ERROR_OK == retval) {
1166                 size_t token_len = item.get_phrase_length();
1167                 if (token_len > prev_token_len) {
1168                     /* found longer match, and save it. */
1169                     prev_token = token;
1170                     prev_token_len = token_len;
1171                 }
1172             }
1173         }
1174     } else {
1175         /* get previous token from match results. */
1176         assert (0 < offset);
1177 
1178         /* no nbest match result. */
1179         if (0 == results.size())
1180             return prev_token;
1181 
1182         /* use the first candidate. */
1183         MatchResult result = NULL;
1184         assert(results.get_result(0, result));
1185 
1186         phrase_token_t cur_token = g_array_index
1187             (result, phrase_token_t, offset);
1188         if (null_token != cur_token) {
1189             for (i = offset - 1; i >= 0; --i) {
1190                 cur_token = g_array_index(result, phrase_token_t, i);
1191                 if (null_token != cur_token) {
1192                     prev_token = cur_token;
1193                     break;
1194                 }
1195             }
1196         }
1197     }
1198 
1199     return prev_token;
1200 }
1201 
_append_items(PhraseIndexRanges ranges,lookup_candidate_t * template_item,CandidateVector items)1202 static void _append_items(PhraseIndexRanges ranges,
1203                           lookup_candidate_t * template_item,
1204                           CandidateVector items) {
1205     /* reduce and append to a single GArray. */
1206     for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) {
1207         if (NULL == ranges[m])
1208             continue;
1209 
1210         for (size_t n = 0; n < ranges[m]->len; ++n) {
1211             PhraseIndexRange * range =
1212                 &g_array_index(ranges[m], PhraseIndexRange, n);
1213             for (size_t k = range->m_range_begin;
1214                  k < range->m_range_end; ++k) {
1215                 lookup_candidate_t item;
1216                 item.m_candidate_type = template_item->m_candidate_type;
1217                 item.m_token = k;
1218                 item.m_begin = template_item->m_begin;
1219                 item.m_end = template_item->m_end;
1220                 item.m_freq = template_item->m_freq;
1221                 g_array_append_val(items, item);
1222             }
1223         }
1224     }
1225 }
1226 
_compute_frequency_of_items(zhuyin_context_t * context,phrase_token_t prev_token,SingleGram * merged_gram,CandidateVector items)1227 static void _compute_frequency_of_items(zhuyin_context_t * context,
1228                                         phrase_token_t prev_token,
1229                                         SingleGram * merged_gram,
1230                                         CandidateVector items) {
1231     pinyin_option_t & options = context->m_options;
1232     ssize_t i;
1233 
1234     PhraseItem cached_item;
1235     /* compute all freqs. */
1236     for (i = 0; i < items->len; ++i) {
1237         lookup_candidate_t * item = &g_array_index
1238             (items, lookup_candidate_t, i);
1239         phrase_token_t & token = item->m_token;
1240 
1241         gfloat bigram_poss = 0; guint32 total_freq = 0;
1242 
1243         gfloat lambda = context->m_system_table_info.get_lambda();
1244 
1245         if (options & DYNAMIC_ADJUST) {
1246             if (null_token != prev_token) {
1247                 guint32 bigram_freq = 0;
1248                 merged_gram->get_total_freq(total_freq);
1249                 merged_gram->get_freq(token, bigram_freq);
1250                 if (0 != total_freq)
1251                     bigram_poss = bigram_freq / (gfloat)total_freq;
1252             }
1253         }
1254 
1255         /* compute the m_freq. */
1256         FacadePhraseIndex * & phrase_index = context->m_phrase_index;
1257         phrase_index->get_phrase_item(token, cached_item);
1258         total_freq = phrase_index->get_phrase_index_total_freq();
1259         assert (0 < total_freq);
1260 
1261         /* Note: possibility value <= 1.0. */
1262         guint32 freq = (lambda * bigram_poss +
1263                         (1 - lambda) *
1264                         cached_item.get_unigram_frequency() /
1265                         (gfloat) total_freq) * 256 * 256 * 256;
1266         item->m_freq = freq;
1267     }
1268 }
1269 
_prepend_sentence_candidates(zhuyin_instance_t * instance,CandidateVector candidates)1270 static bool _prepend_sentence_candidates(zhuyin_instance_t * instance,
1271                                          CandidateVector candidates) {
1272     const size_t size = instance->m_nbest_results.size();
1273 
1274     /* check whether the nbest match candidate exists. */
1275     if (0 == size)
1276         return false;
1277 
1278     /* prepend nbest match candidates to candidates. */
1279     for (ssize_t i = size - 1; i >= 0; --i) {
1280         lookup_candidate_t candidate;
1281         candidate.m_candidate_type = BEST_MATCH_CANDIDATE;
1282         g_array_prepend_val(candidates, candidate);
1283     }
1284 
1285     return true;
1286 }
1287 
_compute_phrase_length(zhuyin_context_t * context,CandidateVector candidates)1288 static bool _compute_phrase_length(zhuyin_context_t * context,
1289                                    CandidateVector candidates) {
1290     FacadePhraseIndex * phrase_index = context->m_phrase_index;
1291 
1292     /* populate m_phrase_length in lookup_candidate_t. */
1293     PhraseItem item;
1294 
1295     for(size_t i = 0; i < candidates->len; ++i) {
1296         lookup_candidate_t * candidate = &g_array_index
1297             (candidates, lookup_candidate_t, i);
1298 
1299         switch(candidate->m_candidate_type) {
1300         case BEST_MATCH_CANDIDATE:
1301             assert(FALSE);
1302         case NORMAL_CANDIDATE_AFTER_CURSOR:
1303         case NORMAL_CANDIDATE_BEFORE_CURSOR: {
1304             phrase_index->get_phrase_item(candidate->m_token, item);
1305             candidate->m_phrase_length = item.get_phrase_length();
1306             break;
1307         }
1308         case ZOMBIE_CANDIDATE:
1309             assert(FALSE);
1310         }
1311     }
1312 
1313     return true;
1314 }
1315 
_compute_phrase_strings_of_items(zhuyin_instance_t * instance,CandidateVector candidates)1316 static bool _compute_phrase_strings_of_items(zhuyin_instance_t * instance,
1317                                              CandidateVector candidates) {
1318     /* populate m_phrase_string in lookup_candidate_t. */
1319 
1320     for(size_t i = 0; i < candidates->len; ++i) {
1321         lookup_candidate_t * candidate = &g_array_index
1322             (candidates, lookup_candidate_t, i);
1323 
1324         switch(candidate->m_candidate_type) {
1325         case BEST_MATCH_CANDIDATE: {
1326             gchar * sentence = NULL;
1327             zhuyin_get_sentence(instance, &sentence);
1328             candidate->m_phrase_string = sentence;
1329             break;
1330         }
1331         case NORMAL_CANDIDATE_AFTER_CURSOR:
1332         case NORMAL_CANDIDATE_BEFORE_CURSOR:
1333             _token_get_phrase
1334                 (instance->m_context->m_phrase_index,
1335                  candidate->m_token, NULL,
1336                  &(candidate->m_phrase_string));
1337             break;
1338         case ZOMBIE_CANDIDATE:
1339             assert(FALSE);
1340         }
1341     }
1342 
1343     return true;
1344 }
1345 
compare_indexed_item_with_phrase_string(gconstpointer lhs,gconstpointer rhs,gpointer userdata)1346 static gint compare_indexed_item_with_phrase_string(gconstpointer lhs,
1347                                                     gconstpointer rhs,
1348                                                     gpointer userdata) {
1349     size_t index_lhs = *((size_t *) lhs);
1350     size_t index_rhs = *((size_t *) rhs);
1351     CandidateVector candidates = (CandidateVector) userdata;
1352 
1353     lookup_candidate_t * candidate_lhs =
1354         &g_array_index(candidates, lookup_candidate_t, index_lhs);
1355     lookup_candidate_t * candidate_rhs =
1356         &g_array_index(candidates, lookup_candidate_t, index_rhs);
1357 
1358     return -strcmp(candidate_lhs->m_phrase_string,
1359                    candidate_rhs->m_phrase_string); /* in descendant order */
1360 }
1361 
1362 
_remove_duplicated_items_by_phrase_string(zhuyin_instance_t * instance,CandidateVector candidates)1363 static bool _remove_duplicated_items_by_phrase_string
1364 (zhuyin_instance_t * instance,
1365  CandidateVector candidates) {
1366     size_t i;
1367     /* create the GArray of indexed item */
1368     GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t));
1369     for (i = 0; i < candidates->len; ++i)
1370         g_array_append_val(indices, i);
1371 
1372     /* sort the indices array by phrase array */
1373     g_array_sort_with_data
1374         (indices, compare_indexed_item_with_phrase_string, candidates);
1375 
1376     /* mark duplicated items as zombie candidate */
1377     lookup_candidate_t * cur_item, * saved_item = NULL;
1378     for (i = 0; i < indices->len; ++i) {
1379         size_t cur_index = g_array_index(indices, size_t, i);
1380         cur_item = &g_array_index(candidates, lookup_candidate_t, cur_index);
1381 
1382         /* handle the first candidate */
1383         if (NULL == saved_item) {
1384             saved_item = cur_item;
1385             continue;
1386         }
1387 
1388         if (0 == strcmp(saved_item->m_phrase_string,
1389                         cur_item->m_phrase_string)) {
1390             /* found duplicated candidates */
1391 
1392             /* keep nbest match candidate */
1393             if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) {
1394                 cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
1395                 continue;
1396             }
1397 
1398             if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) {
1399                 saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
1400                 saved_item = cur_item;
1401                 continue;
1402             }
1403 
1404             /* keep the higher possiblity one
1405                to quickly move the word forward in the candidate list */
1406             if (cur_item->m_freq > saved_item->m_freq) {
1407                 /* find better candidate */
1408                 saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
1409                 saved_item = cur_item;
1410                 continue;
1411             } else {
1412                 cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
1413                 continue;
1414             }
1415         } else {
1416             /* keep the current candidate */
1417             saved_item = cur_item;
1418         }
1419     }
1420 
1421     g_array_free(indices, TRUE);
1422 
1423     /* remove zombie candidate from the returned candidates */
1424     for (i = 0; i < candidates->len; ++i) {
1425         lookup_candidate_t * candidate = &g_array_index
1426             (candidates, lookup_candidate_t, i);
1427 
1428         if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) {
1429             g_free(candidate->m_phrase_string);
1430             g_array_remove_index(candidates, i);
1431             i--;
1432         }
1433     }
1434 
1435     return true;
1436 }
1437 
1438 /* offset must at the beginning of zero ChewingKey "'". */
_check_offset(PhoneticKeyMatrix & matrix,size_t offset)1439 static bool _check_offset(PhoneticKeyMatrix & matrix, size_t offset) {
1440     const size_t start = offset;
1441 
1442     ChewingKey key; ChewingKeyRest key_rest;
1443     const ChewingKey zero_key;
1444 
1445     if (start > 0) {
1446         const size_t index = start - 1;
1447         const size_t size = matrix.get_column_size(index);
1448         if (1 == size) {
1449             /* assume only one zero ChewingKey "'" here, but no check. */
1450             matrix.get_item(index, 0, key, key_rest);
1451             assert(zero_key != key);
1452         }
1453     }
1454 
1455     return true;
1456 }
1457 
zhuyin_guess_candidates_after_cursor(zhuyin_instance_t * instance,size_t offset)1458 bool zhuyin_guess_candidates_after_cursor(zhuyin_instance_t * instance,
1459                                           size_t offset) {
1460 
1461     zhuyin_context_t * & context = instance->m_context;
1462     zhuyin_option_t & options = context->m_options;
1463     PhoneticKeyMatrix & matrix = instance->m_matrix;
1464     CandidateVector candidates = instance->m_candidates;
1465 
1466     _free_candidates(candidates);
1467 
1468     if (0 == matrix.size())
1469         return false;
1470 
1471     /* lookup the previous token here. */
1472     phrase_token_t prev_token = null_token;
1473 
1474     if (options & DYNAMIC_ADJUST) {
1475         prev_token = _get_previous_token(instance, offset);
1476     }
1477 
1478     SingleGram merged_gram;
1479     SingleGram * system_gram = NULL, * user_gram = NULL;
1480 
1481     if (options & DYNAMIC_ADJUST) {
1482         if (null_token != prev_token) {
1483             context->m_system_bigram->load(prev_token, system_gram);
1484             context->m_user_bigram->load(prev_token, user_gram);
1485             merge_single_gram(&merged_gram, system_gram, user_gram);
1486         }
1487     }
1488 
1489     PhraseIndexRanges ranges;
1490     memset(ranges, 0, sizeof(ranges));
1491     context->m_phrase_index->prepare_ranges(ranges);
1492 
1493     _check_offset(matrix, offset);
1494 
1495     /* matrix reserved one extra slot. */
1496     const size_t start = offset;
1497     for (size_t end = start + 1; end < matrix.size(); ++end) {
1498         /* do pinyin search. */
1499         context->m_phrase_index->clear_ranges(ranges);
1500         int retval = search_matrix(context->m_pinyin_table, &matrix,
1501                                    start, end, ranges);
1502 
1503         if ( !(retval & SEARCH_OK) )
1504             continue;
1505 
1506         lookup_candidate_t template_item;
1507         template_item.m_begin = start; template_item.m_end = end;
1508         _append_items(ranges, &template_item, candidates);
1509 
1510         if ( !(retval & SEARCH_CONTINUED) )
1511             break;
1512     }
1513 
1514     context->m_phrase_index->destroy_ranges(ranges);
1515     if (system_gram)
1516         delete system_gram;
1517     if (user_gram)
1518         delete user_gram;
1519 
1520     /* post process to sort the candidates */
1521 
1522     _compute_phrase_length(context, candidates);
1523 
1524     _compute_frequency_of_items(context, prev_token, &merged_gram, candidates);
1525 
1526     /* sort the candidates by length and frequency. */
1527     g_array_sort(candidates, compare_item_with_length_and_frequency);
1528 
1529     /* post process to remove duplicated candidates */
1530 
1531     _prepend_sentence_candidates(instance, instance->m_candidates);
1532 
1533     _compute_phrase_strings_of_items(instance, instance->m_candidates);
1534 
1535     _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates);
1536 
1537     return true;
1538 }
1539 
zhuyin_guess_candidates_before_cursor(zhuyin_instance_t * instance,size_t offset)1540 bool zhuyin_guess_candidates_before_cursor(zhuyin_instance_t * instance,
1541                                           size_t offset) {
1542     zhuyin_context_t * & context = instance->m_context;
1543     zhuyin_option_t & options = context->m_options;
1544     PhoneticKeyMatrix & matrix = instance->m_matrix;
1545     CandidateVector candidates = instance->m_candidates;
1546 
1547     _free_candidates(candidates);
1548 
1549     if (0 == matrix.size())
1550         return false;
1551 
1552     PhraseIndexRanges ranges;
1553     memset(ranges, 0, sizeof(ranges));
1554     context->m_phrase_index->prepare_ranges(ranges);
1555 
1556     _check_offset(matrix, offset);
1557 
1558     GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
1559 
1560     /* matrix reserved one extra slot. */
1561     for (size_t len = offset; len >= 1; --len) {
1562         _free_candidates(items);
1563         const size_t start = offset - len;
1564 
1565         /* lookup the previous token here. */
1566         phrase_token_t prev_token = null_token;
1567 
1568         if (options & DYNAMIC_ADJUST) {
1569             prev_token = _get_previous_token(instance, start);
1570         }
1571 
1572         SingleGram merged_gram;
1573         SingleGram * system_gram = NULL, * user_gram = NULL;
1574 
1575         if (options & DYNAMIC_ADJUST) {
1576             if (null_token != prev_token) {
1577                 context->m_system_bigram->load(prev_token, system_gram);
1578                 context->m_user_bigram->load(prev_token, user_gram);
1579                 merge_single_gram(&merged_gram, system_gram, user_gram);
1580             }
1581         }
1582 
1583         /* do pinyin search. */
1584         context->m_phrase_index->clear_ranges(ranges);
1585         int retval = search_matrix(context->m_pinyin_table, &matrix,
1586                                    start, offset, ranges);
1587 
1588         if ( !(retval & SEARCH_OK) )
1589             continue;
1590 
1591         lookup_candidate_t template_item;
1592         template_item.m_candidate_type = NORMAL_CANDIDATE_BEFORE_CURSOR;
1593         template_item.m_begin = start; template_item.m_end = offset;
1594         _append_items(ranges, &template_item, items);
1595 
1596         if (system_gram)
1597             delete system_gram;
1598         if (user_gram)
1599             delete user_gram;
1600 
1601         /* post process to sort the items */
1602 
1603         _compute_phrase_length(context, items);
1604 
1605         _compute_frequency_of_items(context, prev_token, &merged_gram, items);
1606 
1607         /* sort the items by length and frequency. */
1608         g_array_sort(items, compare_item_with_length_and_frequency);
1609 
1610         g_array_append_vals(candidates, items->data, items->len);
1611 
1612 #if 0
1613         /* no continue information. */
1614         if ( !(retval & SEARCH_CONTINUED) )
1615             break;
1616 #endif
1617     }
1618 
1619     _free_candidates(items);
1620     context->m_phrase_index->destroy_ranges(ranges);
1621 
1622     /* post process to remove duplicated candidates */
1623 
1624     _prepend_sentence_candidates(instance, instance->m_candidates);
1625 
1626     _compute_phrase_strings_of_items(instance, instance->m_candidates);
1627 
1628     _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates);
1629 
1630     return true;
1631 }
1632 
zhuyin_choose_candidate(zhuyin_instance_t * instance,size_t offset,lookup_candidate_t * candidate)1633 int zhuyin_choose_candidate(zhuyin_instance_t * instance,
1634                             size_t offset,
1635                             lookup_candidate_t * candidate){
1636     zhuyin_context_t * & context = instance->m_context;
1637     PhoneticKeyMatrix & matrix = instance->m_matrix;
1638     ForwardPhoneticConstraints * constraints = instance->m_constraints;
1639     NBestMatchResults & results = instance->m_nbest_results;
1640 
1641     if (BEST_MATCH_CANDIDATE == candidate->m_candidate_type)
1642         return matrix.size() - 1;
1643 
1644     /* sync m_constraints to the length of m_pinyin_keys. */
1645     bool retval = constraints->validate_constraint(&matrix);
1646 
1647     if (NORMAL_CANDIDATE_AFTER_CURSOR == candidate->m_candidate_type) {
1648         phrase_token_t token = candidate->m_token;
1649         guint8 len = constraints->add_constraint
1650             (candidate->m_begin, candidate->m_end, token);
1651         offset = candidate->m_end;
1652     }
1653 
1654     if (NORMAL_CANDIDATE_BEFORE_CURSOR == candidate->m_candidate_type) {
1655         phrase_token_t token = candidate->m_token;
1656         guint8 len = constraints->add_constraint
1657             (candidate->m_begin, candidate->m_end, token);
1658         offset = candidate->m_begin;
1659     }
1660 
1661     /* safe guard: validate the m_constraints again. */
1662     retval = constraints->validate_constraint(&matrix);
1663 
1664     return offset;
1665 }
1666 
zhuyin_clear_constraint(zhuyin_instance_t * instance,size_t offset)1667 bool zhuyin_clear_constraint(zhuyin_instance_t * instance,
1668                              size_t offset){
1669     ForwardPhoneticConstraints * constraints = instance->m_constraints;
1670 
1671     bool retval = constraints->clear_constraint(offset);
1672 
1673     return retval;
1674 }
1675 
zhuyin_lookup_tokens(zhuyin_instance_t * instance,const char * phrase,GArray * tokenarray)1676 bool zhuyin_lookup_tokens(zhuyin_instance_t * instance,
1677                           const char * phrase, GArray * tokenarray){
1678     zhuyin_context_t * & context = instance->m_context;
1679     FacadePhraseIndex * & phrase_index = context->m_phrase_index;
1680 
1681     glong ucs4_len = 0;
1682     ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &ucs4_len, NULL);
1683 
1684     PhraseTokens tokens;
1685     memset(tokens, 0, sizeof(PhraseTokens));
1686     phrase_index->prepare_tokens(tokens);
1687     int retval = context->m_phrase_table->search(ucs4_len, ucs4_phrase, tokens);
1688     int num = reduce_tokens(tokens, tokenarray);
1689     phrase_index->destroy_tokens(tokens);
1690 
1691     return SEARCH_OK & retval;
1692 }
1693 
zhuyin_train(zhuyin_instance_t * instance)1694 bool zhuyin_train(zhuyin_instance_t * instance){
1695     if (!instance->m_context->m_user_dir)
1696         return false;
1697 
1698     zhuyin_context_t * context = instance->m_context;
1699     PhoneticKeyMatrix & matrix = instance->m_matrix;
1700     NBestMatchResults & results = instance->m_nbest_results;
1701 
1702     if (0 == results.size())
1703         return false;
1704 
1705     context->m_modified = true;
1706 
1707     MatchResult result = NULL;
1708     assert(results.get_result(0, result));
1709 
1710     bool retval = context->m_pinyin_lookup->train_result3
1711         (&matrix, instance->m_constraints, result);
1712 
1713     return retval;
1714 }
1715 
zhuyin_reset(zhuyin_instance_t * instance)1716 bool zhuyin_reset(zhuyin_instance_t * instance){
1717     instance->m_parsed_len = 0;
1718     instance->m_matrix.clear_all();
1719 
1720     g_array_set_size(instance->m_prefixes, 0);
1721 
1722     instance->m_constraints->clear();
1723     instance->m_nbest_results.clear();
1724     g_array_set_size(instance->m_phrase_result, 0);
1725     _free_candidates(instance->m_candidates);
1726 
1727     return true;
1728 }
1729 
zhuyin_get_zhuyin_string(zhuyin_instance_t * instance,ChewingKey * key,gchar ** utf8_str)1730 bool zhuyin_get_zhuyin_string(zhuyin_instance_t * instance,
1731                               ChewingKey * key,
1732                               gchar ** utf8_str) {
1733     *utf8_str = NULL;
1734     if (0 == key->get_table_index())
1735         return false;
1736 
1737     *utf8_str = key->get_zhuyin_string();
1738     return true;
1739 }
1740 
zhuyin_get_pinyin_string(zhuyin_instance_t * instance,ChewingKey * key,gchar ** utf8_str)1741 bool zhuyin_get_pinyin_string(zhuyin_instance_t * instance,
1742                               ChewingKey * key,
1743                               gchar ** utf8_str) {
1744     zhuyin_context_t * context = instance->m_context;
1745     FullPinyinScheme scheme = context->m_full_pinyin_scheme;
1746 
1747     *utf8_str = NULL;
1748     if (0 == key->get_table_index())
1749         return false;
1750 
1751     switch(scheme) {
1752     case FULL_PINYIN_HANYU:
1753         *utf8_str = key->get_pinyin_string();
1754         break;
1755     case FULL_PINYIN_LUOMA:
1756         *utf8_str = key->get_luoma_pinyin_string();
1757         break;
1758     case FULL_PINYIN_SECONDARY_ZHUYIN:
1759         *utf8_str = key->get_secondary_zhuyin_string();
1760         break;
1761     }
1762 
1763     return true;
1764 }
1765 
zhuyin_token_get_phrase(zhuyin_instance_t * instance,phrase_token_t token,guint * len,gchar ** utf8_str)1766 bool zhuyin_token_get_phrase(zhuyin_instance_t * instance,
1767                              phrase_token_t token,
1768                              guint * len,
1769                              gchar ** utf8_str) {
1770     zhuyin_context_t * & context = instance->m_context;
1771 
1772     return _token_get_phrase(context->m_phrase_index,
1773                              token, len, utf8_str);
1774 }
1775 
zhuyin_token_get_n_pronunciation(zhuyin_instance_t * instance,phrase_token_t token,guint * num)1776 bool zhuyin_token_get_n_pronunciation(zhuyin_instance_t * instance,
1777                                       phrase_token_t token,
1778                                       guint * num){
1779     *num = 0;
1780     zhuyin_context_t * & context = instance->m_context;
1781     PhraseItem item;
1782 
1783     int retval = context->m_phrase_index->get_phrase_item(token, item);
1784     if (ERROR_OK != retval)
1785         return false;
1786 
1787     *num = item.get_n_pronunciation();
1788     return true;
1789 }
1790 
zhuyin_token_get_nth_pronunciation(zhuyin_instance_t * instance,phrase_token_t token,guint nth,ChewingKeyVector keys)1791 bool zhuyin_token_get_nth_pronunciation(zhuyin_instance_t * instance,
1792                                         phrase_token_t token,
1793                                         guint nth,
1794                                         ChewingKeyVector keys){
1795     g_array_set_size(keys, 0);
1796     zhuyin_context_t * & context = instance->m_context;
1797     PhraseItem item;
1798     ChewingKey buffer[MAX_PHRASE_LENGTH];
1799     guint32 freq = 0;
1800 
1801     int retval = context->m_phrase_index->get_phrase_item(token, item);
1802     if (ERROR_OK != retval)
1803         return false;
1804 
1805     item.get_nth_pronunciation(nth, buffer, freq);
1806     guint8 len = item.get_phrase_length();
1807     g_array_append_vals(keys, buffer, len);
1808     return true;
1809 }
1810 
zhuyin_token_get_unigram_frequency(zhuyin_instance_t * instance,phrase_token_t token,guint * freq)1811 bool zhuyin_token_get_unigram_frequency(zhuyin_instance_t * instance,
1812                                         phrase_token_t token,
1813                                         guint * freq) {
1814     *freq = 0;
1815     zhuyin_context_t * & context = instance->m_context;
1816     PhraseItem item;
1817 
1818     int retval = context->m_phrase_index->get_phrase_item(token, item);
1819     if (ERROR_OK != retval)
1820         return false;
1821 
1822     *freq = item.get_unigram_frequency();
1823     return true;
1824 }
1825 
zhuyin_token_add_unigram_frequency(zhuyin_instance_t * instance,phrase_token_t token,guint delta)1826 bool zhuyin_token_add_unigram_frequency(zhuyin_instance_t * instance,
1827                                         phrase_token_t token,
1828                                         guint delta){
1829     zhuyin_context_t * & context = instance->m_context;
1830     int retval = context->m_phrase_index->add_unigram_frequency
1831         (token, delta);
1832     return ERROR_OK == retval;
1833 }
1834 
zhuyin_get_n_candidate(zhuyin_instance_t * instance,guint * num)1835 bool zhuyin_get_n_candidate(zhuyin_instance_t * instance,
1836                             guint * num) {
1837     *num = instance->m_candidates->len;
1838     return true;
1839 }
1840 
zhuyin_get_candidate(zhuyin_instance_t * instance,guint index,lookup_candidate_t ** candidate)1841 bool zhuyin_get_candidate(zhuyin_instance_t * instance,
1842                           guint index,
1843                           lookup_candidate_t ** candidate) {
1844     CandidateVector & candidates = instance->m_candidates;
1845 
1846     *candidate = NULL;
1847 
1848     if (index >= candidates->len)
1849         return false;
1850 
1851     *candidate = &g_array_index(candidates, lookup_candidate_t, index);
1852 
1853     return true;
1854 }
1855 
zhuyin_get_candidate_type(zhuyin_instance_t * instance,lookup_candidate_t * candidate,lookup_candidate_type_t * type)1856 bool zhuyin_get_candidate_type(zhuyin_instance_t * instance,
1857                                lookup_candidate_t * candidate,
1858                                lookup_candidate_type_t * type) {
1859     *type = candidate->m_candidate_type;
1860     return true;
1861 }
1862 
zhuyin_get_candidate_string(zhuyin_instance_t * instance,lookup_candidate_t * candidate,const gchar ** utf8_str)1863 bool zhuyin_get_candidate_string(zhuyin_instance_t * instance,
1864                                  lookup_candidate_t * candidate,
1865                                  const gchar ** utf8_str) {
1866     *utf8_str = candidate->m_phrase_string;
1867     return true;
1868 }
1869 
#if 0
/* Compiled out.  This accessor reads instance->m_pinyin_keys and
 * instance->m_pinyin_key_rests, while the enabled key accessors in this
 * file go through instance->m_matrix.
 * NOTE(review): presumably a leftover from an older key storage --
 * confirm before re-enabling or deleting.
 *
 * Stores the number of keys in *num; fails (with *num == 0) when the two
 * arrays differ in length. */
bool zhuyin_get_n_zhuyin(zhuyin_instance_t * instance,
                         guint * num) {
    *num = 0;

    if (instance->m_pinyin_keys->len !=
        instance->m_pinyin_key_rests->len)
        return false;

    *num = instance->m_pinyin_keys->len;
    return true;
}
#endif
1883 
zhuyin_get_zhuyin_key(zhuyin_instance_t * instance,size_t offset,ChewingKey ** ppkey)1884 bool zhuyin_get_zhuyin_key(zhuyin_instance_t * instance,
1885                            size_t offset,
1886                            ChewingKey ** ppkey) {
1887     PhoneticKeyMatrix & matrix = instance->m_matrix;
1888     *ppkey = NULL;
1889 
1890     if (offset >= matrix.size() - 1)
1891         return false;
1892 
1893     if (0 == matrix.get_column_size(offset))
1894         return false;
1895 
1896     _check_offset(matrix, offset);
1897 
1898     static ChewingKey key;
1899     ChewingKeyRest key_rest;
1900     matrix.get_item(offset, 0, key, key_rest);
1901 
1902     *ppkey = &key;
1903     return true;
1904 }
1905 
zhuyin_get_zhuyin_key_rest(zhuyin_instance_t * instance,size_t offset,ChewingKeyRest ** ppkey_rest)1906 bool zhuyin_get_zhuyin_key_rest(zhuyin_instance_t * instance,
1907                                 size_t offset,
1908                                 ChewingKeyRest ** ppkey_rest) {
1909     PhoneticKeyMatrix & matrix = instance->m_matrix;
1910     *ppkey_rest = NULL;
1911 
1912     if (offset >= matrix.size() - 1)
1913         return false;
1914 
1915     if (0 == matrix.get_column_size(offset))
1916         return false;
1917 
1918     _check_offset(matrix, offset);
1919 
1920     ChewingKey key;
1921     static ChewingKeyRest key_rest;
1922     matrix.get_item(offset, 0, key, key_rest);
1923 
1924     *ppkey_rest = &key_rest;
1925     return true;
1926 }
1927 
zhuyin_get_zhuyin_key_rest_positions(zhuyin_instance_t * instance,ChewingKeyRest * key_rest,guint16 * begin,guint16 * end)1928 bool zhuyin_get_zhuyin_key_rest_positions(zhuyin_instance_t * instance,
1929                                           ChewingKeyRest * key_rest,
1930                                           guint16 * begin, guint16 * end) {
1931     if (begin)
1932         *begin = key_rest->m_raw_begin;
1933 
1934     if (end)
1935         *end = key_rest->m_raw_end;
1936 
1937     return true;
1938 }
1939 
zhuyin_get_zhuyin_key_rest_length(zhuyin_instance_t * instance,ChewingKeyRest * key_rest,guint16 * length)1940 bool zhuyin_get_zhuyin_key_rest_length(zhuyin_instance_t * instance,
1941                                        ChewingKeyRest * key_rest,
1942                                        guint16 * length) {
1943     *length = key_rest->length();
1944     return true;
1945 }
1946 
1947 /* when lookup offset:
1948    get the previous non-zero ChewingKey. */
zhuyin_get_zhuyin_offset(zhuyin_instance_t * instance,size_t cursor,size_t * poffset)1949 bool zhuyin_get_zhuyin_offset(zhuyin_instance_t * instance,
1950                               size_t cursor,
1951                               size_t * poffset) {
1952     PhoneticKeyMatrix & matrix = instance->m_matrix;
1953     size_t offset = std_lite::min(cursor, instance->m_parsed_len);
1954 
1955     /* find the first ChewingKey. */
1956     for (; offset > 0; --offset) {
1957         const size_t size = matrix.get_column_size(offset);
1958 
1959         if (size > 0)
1960             break;
1961     }
1962 
1963     _check_offset(matrix, offset);
1964 
1965     *poffset = offset;
1966     return true;
1967 }
1968 
zhuyin_get_left_zhuyin_offset(zhuyin_instance_t * instance,size_t offset,size_t * pleft)1969 bool zhuyin_get_left_zhuyin_offset(zhuyin_instance_t * instance,
1970                                    size_t offset,
1971                                    size_t * pleft) {
1972     PhoneticKeyMatrix & matrix = instance->m_matrix;
1973     _check_offset(matrix, offset);
1974 
1975     /* find the ChewingKey ends at offset. */
1976     size_t left = offset > 0 ? offset - 1 : 0;
1977 
1978     ChewingKey key; ChewingKeyRest key_rest;
1979     for (; left > 0; --left) {
1980         const size_t size = matrix.get_column_size(left);
1981 
1982         size_t i = 0;
1983         for (; i < size; ++i) {
1984             matrix.get_item(left, i, key, key_rest);
1985 
1986             if (offset == key_rest.m_raw_end)
1987                 break;
1988         }
1989 
1990         if (i < size)
1991             break;
1992     }
1993 
1994     _check_offset(matrix, left);
1995 
1996     *pleft = left;
1997     return true;
1998 }
1999 
zhuyin_get_right_zhuyin_offset(zhuyin_instance_t * instance,size_t offset,size_t * pright)2000 bool zhuyin_get_right_zhuyin_offset(zhuyin_instance_t * instance,
2001                                     size_t offset,
2002                                     size_t * pright) {
2003     PhoneticKeyMatrix & matrix = instance->m_matrix;
2004     _check_offset(matrix, offset);
2005 
2006     /* find the first non-zero ChewingKey. */
2007     size_t right = offset;
2008 
2009     ChewingKey key; ChewingKeyRest key_rest;
2010     for (size_t index = right; index < matrix.size() - 1; ++index) {
2011         const size_t size = matrix.get_column_size(index);
2012 
2013         if (1 != size)
2014             break;
2015 
2016         matrix.get_item(index, 0, key, key_rest);
2017         break;
2018     }
2019 
2020     if (0 == matrix.get_column_size(right))
2021         return false;
2022 
2023     matrix.get_item(right, 0, key, key_rest);
2024     right = key_rest.m_raw_end;
2025     _check_offset(matrix, right);
2026 
2027     *pright = right;
2028     return true;
2029 }
2030 
_pre_compute_tokens(zhuyin_context_t * context,TokenVector cached_tokens,ucs4_t * phrase,size_t phrase_length)2031 static bool _pre_compute_tokens(zhuyin_context_t * context,
2032                                 TokenVector cached_tokens,
2033                                 ucs4_t * phrase,
2034                                 size_t phrase_length) {
2035     FacadePhraseIndex * phrase_index = context->m_phrase_index;
2036     FacadePhraseTable3 * phrase_table = context->m_phrase_table;
2037 
2038     /* do phrase table search. */
2039     PhraseTokens tokens;
2040     memset(tokens, 0, sizeof(PhraseTokens));
2041     phrase_index->prepare_tokens(tokens);
2042 
2043     for (size_t i = 0; i < phrase_length; ++i) {
2044         phrase_token_t token = null_token;
2045         ucs4_t character = phrase[i];
2046 
2047         phrase_index->clear_tokens(tokens);
2048         int retval = phrase_table->search(1, &character, tokens);
2049 
2050         int num = get_first_token(tokens, token);
2051         /* en-counter un-known character, such as the emoji unicode. */
2052         if (0 == num) {
2053             phrase_index->destroy_tokens(tokens);
2054             return false;
2055         }
2056 
2057         g_array_append_val(cached_tokens, token);
2058     }
2059 
2060     phrase_index->destroy_tokens(tokens);
2061 
2062     return true;
2063 }
2064 
_get_char_offset_recur(zhuyin_instance_t * instance,TokenVector cached_tokens,size_t start,size_t offset,size_t * plength)2065 static bool _get_char_offset_recur(zhuyin_instance_t * instance,
2066                                    TokenVector cached_tokens,
2067                                    size_t start,
2068                                    size_t offset,
2069                                    size_t * plength) {
2070     zhuyin_context_t * context = instance->m_context;
2071     PhoneticKeyMatrix & matrix = instance->m_matrix;
2072     FacadePhraseIndex * phrase_index = context->m_phrase_index;
2073     size_t length = *plength;
2074 
2075     if (start > offset)
2076         return true;
2077 
2078     const size_t size = matrix.get_column_size(start);
2079     /* assume pinyin parsers will filter invalid keys. */
2080     assert(size > 0);
2081 
2082     bool result = false;
2083 
2084     PhraseItem item;
2085     for (size_t i = 0; i < size; ++i) {
2086         ChewingKey key; ChewingKeyRest key_rest;
2087         matrix.get_item(start, i, key, key_rest);
2088 
2089         const size_t newstart = key_rest.m_raw_end;
2090 
2091         /* check pronunciation */
2092         phrase_token_t token = g_array_index
2093             (cached_tokens, phrase_token_t, length);
2094         phrase_index->get_phrase_item(token, item);
2095 
2096         gfloat pinyin_poss = item.get_pronunciation_possibility(&key);
2097         if (pinyin_poss < FLT_EPSILON)
2098             continue;
2099 
2100         if (newstart > offset)
2101             return true;
2102 
2103         ++length;
2104 
2105         result = _get_char_offset_recur
2106             (instance, cached_tokens, newstart, offset, &length);
2107         if (result) {
2108             *plength = length;
2109             return result;
2110         }
2111 
2112         --length;
2113     }
2114 
2115     return result;
2116 }
2117 
zhuyin_get_character_offset(zhuyin_instance_t * instance,const char * phrase,size_t offset,size_t * plength)2118 bool zhuyin_get_character_offset(zhuyin_instance_t * instance,
2119                                  const char * phrase,
2120                                  size_t offset,
2121                                  size_t * plength) {
2122     zhuyin_context_t * context = instance->m_context;
2123     PhoneticKeyMatrix & matrix = instance->m_matrix;
2124 
2125     if (0 == matrix.size())
2126         return false;
2127 
2128     assert(offset < matrix.size());
2129     _check_offset(matrix, offset);
2130 
2131     if (NULL == phrase)
2132         return false;
2133 
2134     glong phrase_length = 0;
2135     ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &phrase_length, NULL);
2136 
2137     if (0 == phrase_length)
2138         return false;
2139 
2140     size_t length = 0;
2141     const size_t start = 0;
2142 
2143     /* pre-compute the tokens vector from phrase. */
2144     TokenVector cached_tokens = g_array_new(TRUE, TRUE, sizeof(phrase_token_t));
2145 
2146     bool retval = _pre_compute_tokens
2147         (context, cached_tokens, ucs4_phrase, phrase_length);
2148 
2149     if (!retval) {
2150         g_array_free(cached_tokens, TRUE);
2151         g_free(ucs4_phrase);
2152         return false;
2153     }
2154 
2155     assert(cached_tokens->len == phrase_length);
2156 
2157     bool result = _get_char_offset_recur
2158         (instance, cached_tokens, start, offset, &length);
2159 
2160     g_array_free(cached_tokens, TRUE);
2161     g_free(ucs4_phrase);
2162 
2163     *plength = length;
2164     return result;
2165 }
2166 
#if 0
/* Compiled out.  Variant of zhuyin_get_character_offset() without the
 * phrase parameter: it adds up the phrase lengths of the non-null
 * tokens found at positions [0, offset) of instance->m_match_results.
 * NOTE(review): presumably superseded by the enabled variant above --
 * confirm before re-enabling or deleting. */
bool zhuyin_get_character_offset(zhuyin_instance_t * instance,
                                 size_t offset,
                                 size_t * plength) {
    zhuyin_context_t * context = instance->m_context;
    FacadePhraseIndex * phrase_index = context->m_phrase_index;

    PhoneticKeyMatrix & matrix = instance->m_matrix;
    MatchResults results = instance->m_match_results;
    _check_offset(matrix, offset);

    size_t length = 0;
    PhraseItem item;
    for (size_t i = 0; i < offset; ++i) {
        phrase_token_t token = g_array_index(results, phrase_token_t, i);
        /* positions without a token contribute nothing. */
        if (null_token == token)
            continue;

        int retval = phrase_index->get_phrase_item(token, item);
        assert(ERROR_OK == retval);
        guint8 len = item.get_phrase_length();
        length += len;
    }

    *plength = length;
    return true;
}
#endif
2195 
2196 
zhuyin_get_n_phrase(zhuyin_instance_t * instance,guint * num)2197 bool zhuyin_get_n_phrase(zhuyin_instance_t * instance,
2198                          guint * num) {
2199     *num = instance->m_phrase_result->len;
2200     return true;
2201 }
2202 
zhuyin_get_phrase_token(zhuyin_instance_t * instance,guint index,phrase_token_t * token)2203 bool zhuyin_get_phrase_token(zhuyin_instance_t * instance,
2204                              guint index,
2205                              phrase_token_t * token){
2206     MatchResult & result = instance->m_phrase_result;
2207 
2208     *token = null_token;
2209 
2210     if (index >= result->len)
2211         return false;
2212 
2213     *token = g_array_index(result, phrase_token_t, index);
2214 
2215     return true;
2216 }
2217 
2218 /**
2219  *  Note: prefix is the text before the pre-edit string.
2220  */
2221