1 /*
2 * libpinyin
3 * Library to deal with pinyin.
4 *
5 * Copyright (C) 2017 Peng Wu <alexepico@gmail.com>
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "zhuyin.h"
22 #include <stdio.h>
23 #include <unistd.h>
24 #include <glib/gstdio.h>
25 #include "pinyin_internal.h"
26
27
28 using namespace pinyin;
29
30 /* a glue layer for input method integration. */
31
/* GArray whose elements are lookup_candidate_t values (stored by value,
   see zhuyin_alloc_instance() and _free_candidates()). */
typedef GArray * CandidateVector; /* GArray of lookup_candidate_t */
33
/* Engine-wide state shared by all instances: options, input parsers,
   tables, bi-grams and lookups.
   Filled in by zhuyin_init() and torn down by zhuyin_fini(). */
struct _zhuyin_context_t{
    /* option flags; zhuyin_init() starts with USE_TONE | FORCE_TONE. */
    zhuyin_option_t m_options;

    /* input parsers. */
    FullPinyinScheme m_full_pinyin_scheme;
    FullPinyinParser2 * m_full_pinyin_parser;
    /* replaced as a whole by zhuyin_set_chewing_scheme(). */
    ZhuyinParser2 * m_chewing_parser;

    /* default tables. */
    FacadeChewingTable2 * m_pinyin_table;
    FacadePhraseTable3 * m_phrase_table;
    FacadePhraseIndex * m_phrase_index;
    Bigram * m_system_bigram;
    Bigram * m_user_bigram;

    /* lookups. */
    PhoneticLookup<1, 1> * m_pinyin_lookup;
    PhraseLookup * m_phrase_lookup;

    /* g_strdup() copies of the directories passed to zhuyin_init(). */
    char * m_system_dir;
    char * m_user_dir;
    /* set by zhuyin_end_add_phrases(), cleared by zhuyin_save();
       zhuyin_save() does nothing while this is false. */
    bool m_modified;

    /* loaded from SYSTEM_TABLE_INFO in the system directory. */
    SystemTableInfo2 m_system_table_info;
};
59
/* Per-input-session state. Allocated by zhuyin_alloc_instance() and
   released by zhuyin_free_instance(). */
struct _zhuyin_instance_t{
    /* pointer of zhuyin_context_t. */
    zhuyin_context_t * m_context;

    /* the tokens of phrases before the user input. */
    /* rebuilt on every guess; always starts with sentence_start. */
    TokenVector m_prefixes;

    /* cached parsed pinyin keys. */
    PhoneticKeyMatrix m_matrix;
    /* value returned by the last parser run
       (see zhuyin_parse_more_full_pinyins()). */
    size_t m_parsed_len;

    /* cached pinyin lookup variables. */
    ForwardPhoneticConstraints * m_constraints;
    NBestMatchResults m_nbest_results;
    /* output of zhuyin_phrase_segment(). */
    TokenVector m_phrase_result;
    /* elements own their m_phrase_string; see _free_candidates(). */
    CandidateVector m_candidates;
};
77
78 struct _lookup_candidate_t{
79 lookup_candidate_type_t m_candidate_type;
80 gchar * m_phrase_string;
81 phrase_token_t m_token;
82 guint8 m_phrase_length;
83 guint16 m_begin; /* must contain the preceding "'" character. */
84 guint16 m_end; /* must not contain the following "'" character. */
85 guint32 m_freq; /* the amplifed gfloat numerical value. */
86
87 public:
_lookup_candidate_t_lookup_candidate_t88 _lookup_candidate_t() {
89 m_candidate_type = NORMAL_CANDIDATE_AFTER_CURSOR;
90 m_phrase_string = NULL;
91 m_token = null_token;
92 m_phrase_length = 0;
93 m_begin = 0; m_end = 0;
94 m_freq = 0;
95 }
96 };
97
/* Handle used while importing phrases: created by
   zhuyin_begin_add_phrases(), consumed by zhuyin_iterator_add_phrase()
   and deleted by zhuyin_end_add_phrases(). */
struct _import_iterator_t{
    /* context whose tables receive the imported phrases. */
    zhuyin_context_t * m_context;
    /* index of the sub phrase library the phrases are added to. */
    guint8 m_phrase_index;
};
102
_clean_user_files(const char * user_dir,const pinyin_table_info_t * phrase_files)103 static bool _clean_user_files(const char * user_dir,
104 const pinyin_table_info_t * phrase_files){
105 /* clean up files, if version mis-matches. */
106 for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
107 const pinyin_table_info_t * table_info = phrase_files + i;
108
109 if (NOT_USED == table_info->m_file_type)
110 continue;
111
112 if (NULL == table_info->m_user_filename)
113 continue;
114
115 const char * userfilename = table_info->m_user_filename;
116
117 /* remove dbin file. */
118 gchar * filename = g_build_filename(user_dir, userfilename, NULL);
119 unlink(filename);
120 g_free(filename);
121 }
122
123 return true;
124 }
125
check_format(zhuyin_context_t * context)126 static bool check_format(zhuyin_context_t * context){
127 const char * user_dir = context->m_user_dir;
128
129 UserTableInfo user_table_info;
130 gchar * filename = g_build_filename
131 (user_dir, USER_TABLE_INFO, NULL);
132 user_table_info.load(filename);
133 g_free(filename);
134
135 bool exists = user_table_info.is_conform
136 (&context->m_system_table_info);
137
138 if (exists)
139 return exists;
140
141 const pinyin_table_info_t * phrase_files = NULL;
142
143 phrase_files = context->m_system_table_info.get_default_tables();
144 _clean_user_files(user_dir, phrase_files);
145
146 filename = g_build_filename
147 (user_dir, USER_PINYIN_INDEX, NULL);
148 unlink(filename);
149 g_free(filename);
150
151 filename = g_build_filename
152 (user_dir, USER_PHRASE_INDEX, NULL);
153 unlink(filename);
154 g_free(filename);
155
156 filename = g_build_filename
157 (user_dir, USER_BIGRAM, NULL);
158 unlink(filename);
159 g_free(filename);
160
161 return exists;
162 }
163
mark_version(zhuyin_context_t * context)164 static bool mark_version(zhuyin_context_t * context){
165 const char * userdir = context->m_user_dir;
166
167 UserTableInfo user_table_info;
168 user_table_info.make_conform(&context->m_system_table_info);
169
170 gchar * filename = g_build_filename
171 (userdir, USER_TABLE_INFO, NULL);
172 bool retval = user_table_info.save(filename);
173 g_free(filename);
174
175 return retval;
176 }
177
_load_phrase_library(const char * system_dir,const char * user_dir,FacadePhraseIndex * phrase_index,const pinyin_table_info_t * table_info)178 static bool _load_phrase_library (const char * system_dir,
179 const char * user_dir,
180 FacadePhraseIndex * phrase_index,
181 const pinyin_table_info_t * table_info){
182 /* check whether the sub phrase index is already loaded. */
183 PhraseIndexRange range;
184 guint8 index = table_info->m_dict_index;
185
186 int retval = phrase_index->get_range(index, range);
187 if (ERROR_OK == retval)
188 return false;
189
190 if (SYSTEM_FILE == table_info->m_file_type) {
191 /* system phrase library */
192 MemoryChunk * chunk = new MemoryChunk;
193
194 const char * systemfilename = table_info->m_system_filename;
195 /* check bin file in system dir. */
196 gchar * chunkfilename = g_build_filename(system_dir,
197 systemfilename, NULL);
198 #ifdef LIBPINYIN_USE_MMAP
199 if (!chunk->mmap(chunkfilename))
200 fprintf(stderr, "mmap %s failed!\n", chunkfilename);
201 #else
202 if (!chunk->load(chunkfilename))
203 fprintf(stderr, "open %s failed!\n", chunkfilename);
204 #endif
205
206 g_free(chunkfilename);
207
208 phrase_index->load(index, chunk);
209
210 const char * userfilename = table_info->m_user_filename;
211
212 chunkfilename = g_build_filename(user_dir,
213 userfilename, NULL);
214
215 MemoryChunk * log = new MemoryChunk;
216 log->load(chunkfilename);
217 g_free(chunkfilename);
218
219 /* merge the chunk log. */
220 phrase_index->merge(index, log);
221 return true;
222 }
223
224 if (DICTIONARY == table_info->m_file_type) {
225 /* addon dictionary. */
226 MemoryChunk * chunk = new MemoryChunk;
227
228 const char * systemfilename = table_info->m_system_filename;
229 /* check bin file in system dir. */
230 gchar * chunkfilename = g_build_filename(system_dir,
231 systemfilename, NULL);
232 #ifdef LIBPINYIN_USE_MMAP
233 if (!chunk->mmap(chunkfilename))
234 fprintf(stderr, "mmap %s failed!\n", chunkfilename);
235 #else
236 if (!chunk->load(chunkfilename))
237 fprintf(stderr, "open %s failed!\n", chunkfilename);
238 #endif
239
240 g_free(chunkfilename);
241
242 phrase_index->load(index, chunk);
243
244 return true;
245 }
246
247 if (USER_FILE == table_info->m_file_type) {
248 /* user phrase library */
249 MemoryChunk * chunk = new MemoryChunk;
250 const char * userfilename = table_info->m_user_filename;
251
252 gchar * chunkfilename = g_build_filename(user_dir,
253 userfilename, NULL);
254
255 /* check bin file exists. if not, create a new one. */
256 if (chunk->load(chunkfilename)) {
257 phrase_index->load(index, chunk);
258 } else {
259 delete chunk;
260 phrase_index->create_sub_phrase(index);
261 }
262
263 g_free(chunkfilename);
264 return true;
265 }
266
267 return false;
268 }
269
zhuyin_init(const char * systemdir,const char * userdir)270 zhuyin_context_t * zhuyin_init(const char * systemdir, const char * userdir){
271 zhuyin_context_t * context = new zhuyin_context_t;
272
273 context->m_options = USE_TONE | FORCE_TONE;
274
275 context->m_system_dir = g_strdup(systemdir);
276 context->m_user_dir = g_strdup(userdir);
277 context->m_modified = false;
278
279 gchar * filename = g_build_filename
280 (context->m_system_dir, SYSTEM_TABLE_INFO, NULL);
281 if (!context->m_system_table_info.load(filename)) {
282 fprintf(stderr, "load %s failed!\n", filename);
283 return NULL;
284 }
285 g_free(filename);
286
287
288 check_format(context);
289
290 context->m_full_pinyin_scheme = FULL_PINYIN_DEFAULT;
291 context->m_full_pinyin_parser = new FullPinyinParser2;
292 context->m_chewing_parser = new ZhuyinSimpleParser2;
293
294 /* load chewing table. */
295 context->m_pinyin_table = new FacadeChewingTable2;
296
297 gchar * system_filename = g_build_filename
298 (context->m_system_dir, SYSTEM_PINYIN_INDEX, NULL);
299 gchar * user_filename = g_build_filename
300 (context->m_user_dir, USER_PINYIN_INDEX, NULL);
301 context->m_pinyin_table->load(system_filename, user_filename);
302 g_free(user_filename);
303 g_free(system_filename);
304
305 /* load phrase table */
306 context->m_phrase_table = new FacadePhraseTable3;
307
308 system_filename = g_build_filename
309 (context->m_system_dir, SYSTEM_PHRASE_INDEX, NULL);
310 user_filename = g_build_filename
311 (context->m_user_dir, USER_PHRASE_INDEX, NULL);
312 context->m_phrase_table->load(system_filename, user_filename);
313 g_free(user_filename);
314 g_free(system_filename);
315
316 context->m_phrase_index = new FacadePhraseIndex;
317
318 /* load all default tables. */
319 for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i){
320 const pinyin_table_info_t * phrase_files =
321 context->m_system_table_info.get_default_tables();
322
323 const pinyin_table_info_t * table_info =
324 phrase_files + i;
325
326 if (NOT_USED == table_info->m_file_type)
327 continue;
328
329 /* addon dictionary should not in default tables. */
330 assert(DICTIONARY != table_info->m_file_type);
331
332 _load_phrase_library(context->m_system_dir, context->m_user_dir,
333 context->m_phrase_index, table_info);
334 }
335
336 context->m_system_bigram = new Bigram;
337 filename = g_build_filename(context->m_system_dir, SYSTEM_BIGRAM, NULL);
338 context->m_system_bigram->attach(filename, ATTACH_READONLY);
339 g_free(filename);
340
341 context->m_user_bigram = new Bigram;
342 filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL);
343 context->m_user_bigram->load_db(filename);
344 g_free(filename);
345
346 gfloat lambda = context->m_system_table_info.get_lambda();
347
348 context->m_pinyin_lookup = new PhoneticLookup<1, 1>
349 ( lambda,
350 context->m_pinyin_table, context->m_phrase_index,
351 context->m_system_bigram, context->m_user_bigram);
352
353 context->m_phrase_lookup = new PhraseLookup
354 (lambda,
355 context->m_phrase_table, context->m_phrase_index,
356 context->m_system_bigram, context->m_user_bigram);
357
358 return context;
359 }
360
zhuyin_load_phrase_library(zhuyin_context_t * context,guint8 index)361 bool zhuyin_load_phrase_library(zhuyin_context_t * context,
362 guint8 index){
363 if (!(index < PHRASE_INDEX_LIBRARY_COUNT))
364 return false;
365
366 const pinyin_table_info_t * phrase_files =
367 context->m_system_table_info.get_default_tables();
368 FacadePhraseIndex * phrase_index = context->m_phrase_index;
369 const pinyin_table_info_t * table_info = phrase_files + index;
370
371 /* Only SYSTEM_FILE or USER_FILE is allowed here. */
372 assert(SYSTEM_FILE == table_info->m_file_type
373 || USER_FILE == table_info->m_file_type);
374
375 return _load_phrase_library(context->m_system_dir, context->m_user_dir,
376 phrase_index, table_info);
377 }
378
zhuyin_unload_phrase_library(zhuyin_context_t * context,guint8 index)379 bool zhuyin_unload_phrase_library(zhuyin_context_t * context,
380 guint8 index){
381 assert(index < PHRASE_INDEX_LIBRARY_COUNT);
382
383 /* default table. */
384 /* tsi.bin can't be unloaded. */
385 if (TSI_DICTIONARY == index)
386 return false;
387
388 context->m_phrase_index->unload(index);
389 return true;
390 }
391
zhuyin_begin_add_phrases(zhuyin_context_t * context,guint8 index)392 import_iterator_t * zhuyin_begin_add_phrases(zhuyin_context_t * context,
393 guint8 index){
394 import_iterator_t * iter = new import_iterator_t;
395 iter->m_context = context;
396 iter->m_phrase_index = index;
397 return iter;
398 }
399
_add_phrase(zhuyin_context_t * context,guint8 index,ChewingKeyVector keys,ucs4_t * phrase,glong phrase_length,gint count)400 static bool _add_phrase(zhuyin_context_t * context,
401 guint8 index,
402 ChewingKeyVector keys,
403 ucs4_t * phrase,
404 glong phrase_length,
405 gint count) {
406 /* if -1 == count, use the default value. */
407 const gint default_count = 5;
408 const guint32 unigram_factor = 3;
409 if (-1 == count)
410 count = default_count;
411
412 FacadePhraseTable3 * phrase_table = context->m_phrase_table;
413 FacadeChewingTable2 * pinyin_table = context->m_pinyin_table;
414 FacadePhraseIndex * phrase_index = context->m_phrase_index;
415
416 bool result = false;
417
418 /* check whether the phrase exists in phrase table */
419 phrase_token_t token = null_token;
420 GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
421
422 /* do phrase table search. */
423 PhraseTokens tokens;
424 memset(tokens, 0, sizeof(PhraseTokens));
425 phrase_index->prepare_tokens(tokens);
426 int retval = phrase_table->search(phrase_length, phrase, tokens);
427 int num = reduce_tokens(tokens, tokenarray);
428 phrase_index->destroy_tokens(tokens);
429
430 /* find the best token candidate. */
431 for (size_t i = 0; i < tokenarray->len; ++i) {
432 phrase_token_t candidate = g_array_index(tokenarray, phrase_token_t, i);
433 if (null_token == token) {
434 token = candidate;
435 continue;
436 }
437
438 if (PHRASE_INDEX_LIBRARY_INDEX(candidate) == index) {
439 /* only one phrase string per sub phrase index. */
440 assert(PHRASE_INDEX_LIBRARY_INDEX(token) != index);
441 token = candidate;
442 continue;
443 }
444 }
445 g_array_free(tokenarray, TRUE);
446
447 PhraseItem item;
448 /* check whether it exists in the same sub phrase index; */
449 if (null_token != token &&
450 PHRASE_INDEX_LIBRARY_INDEX(token) == index) {
451 /* if so, remove the phrase, add the pinyin for the phrase item,
452 then add it back;*/
453 phrase_index->get_phrase_item(token, item);
454 assert(phrase_length == item.get_phrase_length());
455 ucs4_t tmp_phrase[MAX_PHRASE_LENGTH];
456 item.get_phrase_string(tmp_phrase);
457 assert(0 == memcmp
458 (phrase, tmp_phrase, sizeof(ucs4_t) * phrase_length));
459
460 PhraseItem * removed_item = NULL;
461 retval = phrase_index->remove_phrase_item(token, removed_item);
462 if (ERROR_OK == retval) {
463 /* maybe check whether there are duplicated pronunciations here. */
464 removed_item->add_pronunciation((ChewingKey *)keys->data,
465 count);
466 phrase_index->add_phrase_item(token, removed_item);
467 delete removed_item;
468 result = true;
469 }
470 } else {
471 /* if not exists in the same sub phrase index,
472 get the maximum token,
473 then add it directly with maximum token + 1; */
474 PhraseIndexRange range;
475 retval = phrase_index->get_range(index, range);
476
477 if (ERROR_OK == retval) {
478 token = range.m_range_end;
479 if (0x00000000 == (token & PHRASE_MASK))
480 token++;
481
482 if (phrase_length == keys->len) { /* valid pinyin */
483 phrase_table->add_index(phrase_length, phrase, token);
484 pinyin_table->add_index
485 (keys->len, (ChewingKey *)(keys->data), token);
486
487 item.set_phrase_string(phrase_length, phrase);
488 item.add_pronunciation((ChewingKey *)(keys->data), count);
489 phrase_index->add_phrase_item(token, &item);
490 phrase_index->add_unigram_frequency(token,
491 count * unigram_factor);
492 result = true;
493 }
494 }
495 }
496
497 return result;
498 }
499
zhuyin_iterator_add_phrase(import_iterator_t * iter,const char * phrase,const char * pinyin,gint count)500 bool zhuyin_iterator_add_phrase(import_iterator_t * iter,
501 const char * phrase,
502 const char * pinyin,
503 gint count){
504 zhuyin_context_t * context = iter->m_context;
505 guint8 index = iter->m_phrase_index;
506
507 bool result = false;
508
509 if (NULL == phrase || NULL == pinyin)
510 return result;
511
512 glong phrase_length = 0;
513 ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &phrase_length, NULL);
514
515 zhuyin_option_t options = USE_TONE | FORCE_TONE;
516 ZhuyinDirectParser2 parser;
517 ChewingKeyVector keys =
518 g_array_new(FALSE, FALSE, sizeof(ChewingKey));
519 ChewingKeyRestVector key_rests =
520 g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest));
521
522 /* parse the pinyin. */
523 parser.parse(options, keys, key_rests, pinyin, strlen(pinyin));
524
525 if (phrase_length != keys->len)
526 return result;
527
528 if (0 == phrase_length || phrase_length >= MAX_PHRASE_LENGTH)
529 return result;
530
531 result = _add_phrase(context, index, keys,
532 ucs4_phrase, phrase_length, count);
533
534 g_array_free(key_rests, TRUE);
535 g_array_free(keys, TRUE);
536 g_free(ucs4_phrase);
537 return result;
538 }
539
zhuyin_end_add_phrases(import_iterator_t * iter)540 void zhuyin_end_add_phrases(import_iterator_t * iter){
541 /* compact the content memory chunk of phrase index. */
542 iter->m_context->m_phrase_index->compact();
543 iter->m_context->m_modified = true;
544 delete iter;
545 }
546
zhuyin_save(zhuyin_context_t * context)547 bool zhuyin_save(zhuyin_context_t * context){
548 if (!context->m_user_dir)
549 return false;
550
551 if (!context->m_modified)
552 return false;
553
554 context->m_phrase_index->compact();
555
556 const pinyin_table_info_t * phrase_files =
557 context->m_system_table_info.get_default_tables();
558
559 /* skip the reserved zero phrase library. */
560 for (size_t i = 1; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
561 PhraseIndexRange range;
562 int retval = context->m_phrase_index->get_range(i, range);
563
564 if (ERROR_NO_SUB_PHRASE_INDEX == retval)
565 continue;
566
567 const pinyin_table_info_t * table_info = phrase_files + i;
568
569 if (NOT_USED == table_info->m_file_type)
570 continue;
571
572 const char * userfilename = table_info->m_user_filename;
573
574 if (NULL == userfilename)
575 continue;
576
577 if (SYSTEM_FILE == table_info->m_file_type ||
578 DICTIONARY == table_info->m_file_type) {
579 /* system phrase library */
580 MemoryChunk * chunk = new MemoryChunk;
581 MemoryChunk * log = new MemoryChunk;
582 const char * systemfilename = table_info->m_system_filename;
583
584 /* check bin file in system dir. */
585 gchar * chunkfilename = g_build_filename(context->m_system_dir,
586 systemfilename, NULL);
587 #ifdef LIBPINYIN_USE_MMAP
588 if (!chunk->mmap(chunkfilename))
589 fprintf(stderr, "mmap %s failed!\n", chunkfilename);
590 #else
591 if (!chunk->load(chunkfilename))
592 fprintf(stderr, "open %s failed!\n", chunkfilename);
593 #endif
594
595 g_free(chunkfilename);
596 context->m_phrase_index->diff(i, chunk, log);
597
598 const char * userfilename = table_info->m_user_filename;
599 gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
600
601 gchar * tmppathname = g_build_filename(context->m_user_dir,
602 tmpfilename, NULL);
603 g_free(tmpfilename);
604
605 gchar * chunkpathname = g_build_filename(context->m_user_dir,
606 userfilename, NULL);
607 log->save(tmppathname);
608
609 int result = rename(tmppathname, chunkpathname);
610 if (0 != result)
611 fprintf(stderr, "rename %s to %s failed.\n",
612 tmppathname, chunkpathname);
613
614 g_free(chunkpathname);
615 g_free(tmppathname);
616 delete log;
617 }
618
619 if (USER_FILE == table_info->m_file_type) {
620 /* user phrase library */
621 MemoryChunk * chunk = new MemoryChunk;
622 context->m_phrase_index->store(i, chunk);
623
624 const char * userfilename = table_info->m_user_filename;
625 gchar * tmpfilename = g_strdup_printf("%s.tmp", userfilename);
626 gchar * tmppathname = g_build_filename(context->m_user_dir,
627 tmpfilename, NULL);
628 g_free(tmpfilename);
629
630 gchar * chunkpathname = g_build_filename(context->m_user_dir,
631 userfilename, NULL);
632
633 chunk->save(tmppathname);
634
635 int result = rename(tmppathname, chunkpathname);
636 if (0 != result)
637 fprintf(stderr, "rename %s to %s failed.\n",
638 tmppathname, chunkpathname);
639
640 g_free(chunkpathname);
641 g_free(tmppathname);
642 delete chunk;
643 }
644 }
645
646 /* save user pinyin table */
647 gchar * tmpfilename = g_build_filename
648 (context->m_user_dir, USER_PINYIN_INDEX ".tmp", NULL);
649 unlink(tmpfilename);
650 gchar * filename = g_build_filename
651 (context->m_user_dir, USER_PINYIN_INDEX, NULL);
652
653 context->m_pinyin_table->store(tmpfilename);
654
655 int result = rename(tmpfilename, filename);
656 if (0 != result)
657 fprintf(stderr, "rename %s to %s failed.\n",
658 tmpfilename, filename);
659
660 g_free(tmpfilename);
661 g_free(filename);
662
663 /* save user phrase table */
664 tmpfilename = g_build_filename
665 (context->m_user_dir, USER_PHRASE_INDEX ".tmp", NULL);
666 unlink(tmpfilename);
667 filename = g_build_filename
668 (context->m_user_dir, USER_PHRASE_INDEX, NULL);
669
670 context->m_phrase_table->store(tmpfilename);
671
672 result = rename(tmpfilename, filename);
673 if (0 != result)
674 fprintf(stderr, "rename %s to %s failed.\n",
675 tmpfilename, filename);
676
677 g_free(tmpfilename);
678 g_free(filename);
679
680 /* save user bi-gram */
681 tmpfilename = g_build_filename
682 (context->m_user_dir, USER_BIGRAM ".tmp", NULL);
683 unlink(tmpfilename);
684 filename = g_build_filename(context->m_user_dir, USER_BIGRAM, NULL);
685 context->m_user_bigram->save_db(tmpfilename);
686
687 result = rename(tmpfilename, filename);
688 if (0 != result)
689 fprintf(stderr, "rename %s to %s failed.\n",
690 tmpfilename, filename);
691
692 g_free(tmpfilename);
693 g_free(filename);
694
695 mark_version(context);
696
697 context->m_modified = false;
698 return true;
699 }
700
zhuyin_set_full_pinyin_scheme(zhuyin_context_t * context,FullPinyinScheme scheme)701 bool zhuyin_set_full_pinyin_scheme(zhuyin_context_t * context,
702 FullPinyinScheme scheme){
703 context->m_full_pinyin_scheme = scheme;
704 context->m_full_pinyin_parser->set_scheme(scheme);
705 return true;
706 }
707
zhuyin_set_chewing_scheme(zhuyin_context_t * context,ZhuyinScheme scheme)708 bool zhuyin_set_chewing_scheme(zhuyin_context_t * context,
709 ZhuyinScheme scheme){
710 delete context->m_chewing_parser;
711 context->m_chewing_parser = NULL;
712
713 switch(scheme) {
714 case ZHUYIN_STANDARD:
715 case ZHUYIN_IBM:
716 case ZHUYIN_GINYIEH:
717 case ZHUYIN_ETEN:
718 case ZHUYIN_STANDARD_DVORAK: {
719 ZhuyinSimpleParser2 * parser = new ZhuyinSimpleParser2();
720 parser->set_scheme(scheme);
721 context->m_chewing_parser = parser;
722 break;
723 }
724 case ZHUYIN_HSU:
725 case ZHUYIN_ETEN26:
726 case ZHUYIN_HSU_DVORAK: {
727 ZhuyinDiscreteParser2 * parser = new ZhuyinDiscreteParser2();
728 parser->set_scheme(scheme);
729 context->m_chewing_parser = parser;
730 break;
731 }
732 case ZHUYIN_DACHEN_CP26:
733 context->m_chewing_parser = new ZhuyinDaChenCP26Parser2();
734 break;
735 default:
736 assert(FALSE);
737 }
738 return true;
739 }
740
zhuyin_fini(zhuyin_context_t * context)741 void zhuyin_fini(zhuyin_context_t * context){
742 delete context->m_full_pinyin_parser;
743 delete context->m_chewing_parser;
744 delete context->m_pinyin_table;
745 delete context->m_phrase_table;
746 delete context->m_phrase_index;
747 delete context->m_system_bigram;
748 delete context->m_user_bigram;
749 delete context->m_pinyin_lookup;
750 delete context->m_phrase_lookup;
751
752 g_free(context->m_system_dir);
753 g_free(context->m_user_dir);
754 context->m_modified = false;
755
756 delete context;
757 }
758
zhuyin_mask_out(zhuyin_context_t * context,phrase_token_t mask,phrase_token_t value)759 bool zhuyin_mask_out(zhuyin_context_t * context,
760 phrase_token_t mask,
761 phrase_token_t value) {
762
763 context->m_pinyin_table->mask_out(mask, value);
764 context->m_phrase_table->mask_out(mask, value);
765 context->m_user_bigram->mask_out(mask, value);
766
767 const pinyin_table_info_t * phrase_files =
768 context->m_system_table_info.get_default_tables();
769
770 /* mask out the phrase index. */
771 for (size_t index = 1; index < PHRASE_INDEX_LIBRARY_COUNT; ++index) {
772 PhraseIndexRange range;
773 int retval = context->m_phrase_index->get_range(index, range);
774
775 if (ERROR_NO_SUB_PHRASE_INDEX == retval)
776 continue;
777
778 const pinyin_table_info_t * table_info = phrase_files + index;
779
780 if (NOT_USED == table_info->m_file_type)
781 continue;
782
783 const char * userfilename = table_info->m_user_filename;
784
785 if (NULL == userfilename)
786 continue;
787
788 if (SYSTEM_FILE == table_info->m_file_type ||
789 DICTIONARY == table_info->m_file_type) {
790 /* system phrase library */
791 MemoryChunk * chunk = new MemoryChunk;
792
793 const char * systemfilename = table_info->m_system_filename;
794 /* check bin file in system dir. */
795 gchar * chunkfilename = g_build_filename(context->m_system_dir,
796 systemfilename, NULL);
797
798 #ifdef LIBPINYIN_USE_MMAP
799 if (!chunk->mmap(chunkfilename))
800 fprintf(stderr, "mmap %s failed!\n", chunkfilename);
801 #else
802 if (!chunk->load(chunkfilename))
803 fprintf(stderr, "open %s failed!\n", chunkfilename);
804 #endif
805
806 g_free(chunkfilename);
807
808 context->m_phrase_index->load(index, chunk);
809
810 const char * userfilename = table_info->m_user_filename;
811
812 chunkfilename = g_build_filename(context->m_user_dir,
813 userfilename, NULL);
814
815 MemoryChunk * log = new MemoryChunk;
816 log->load(chunkfilename);
817 g_free(chunkfilename);
818
819 /* merge the chunk log with mask. */
820 context->m_phrase_index->merge_with_mask(index, log, mask, value);
821 }
822
823 if (USER_FILE == table_info->m_file_type) {
824 /* user phrase library */
825 context->m_phrase_index->mask_out(index, mask, value);
826 }
827 }
828
829 context->m_phrase_index->compact();
830 return true;
831 }
832
833 /* copy from options to context->m_options. */
zhuyin_set_options(zhuyin_context_t * context,zhuyin_option_t options)834 bool zhuyin_set_options(zhuyin_context_t * context,
835 zhuyin_option_t options){
836 context->m_options = options;
837 #if 0
838 context->m_pinyin_table->set_options(context->m_options);
839 context->m_pinyin_lookup->set_options(context->m_options);
840 #endif
841 return true;
842 }
843
844
zhuyin_alloc_instance(zhuyin_context_t * context)845 zhuyin_instance_t * zhuyin_alloc_instance(zhuyin_context_t * context){
846 zhuyin_instance_t * instance = new zhuyin_instance_t;
847 instance->m_context = context;
848
849 instance->m_prefixes = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
850
851 instance->m_parsed_len = 0;
852
853 instance->m_constraints = new ForwardPhoneticConstraints
854 (context->m_phrase_index);
855
856 instance->m_phrase_result = g_array_new
857 (TRUE, TRUE, sizeof(phrase_token_t));
858 instance->m_candidates =
859 g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
860
861 return instance;
862 }
863
_free_candidates(CandidateVector candidates)864 static bool _free_candidates(CandidateVector candidates) {
865 /* free candidates */
866 for (size_t i = 0; i < candidates->len; ++i) {
867 lookup_candidate_t * candidate = &g_array_index
868 (candidates, lookup_candidate_t, i);
869 g_free(candidate->m_phrase_string);
870 }
871 g_array_set_size(candidates, 0);
872
873 return true;
874 }
875
zhuyin_free_instance(zhuyin_instance_t * instance)876 void zhuyin_free_instance(zhuyin_instance_t * instance){
877 g_array_free(instance->m_prefixes, TRUE);
878 delete instance->m_constraints;
879 g_array_free(instance->m_phrase_result, TRUE);
880 _free_candidates(instance->m_candidates);
881 g_array_free(instance->m_candidates, TRUE);
882
883 delete instance;
884 }
885
zhuyin_update_constraints(zhuyin_instance_t * instance)886 static bool zhuyin_update_constraints(zhuyin_instance_t * instance){
887 PhoneticKeyMatrix & matrix = instance->m_matrix;
888 ForwardPhoneticConstraints * constraints = instance->m_constraints;
889
890 constraints->validate_constraint(&matrix);
891
892 return true;
893 }
894
zhuyin_guess_sentence(zhuyin_instance_t * instance)895 bool zhuyin_guess_sentence(zhuyin_instance_t * instance){
896 zhuyin_context_t * & context = instance->m_context;
897 PhoneticKeyMatrix & matrix = instance->m_matrix;
898
899 g_array_set_size(instance->m_prefixes, 0);
900 g_array_append_val(instance->m_prefixes, sentence_start);
901
902 zhuyin_update_constraints(instance);
903 bool retval = context->m_pinyin_lookup->get_nbest_match
904 (instance->m_prefixes,
905 &matrix,
906 instance->m_constraints,
907 &instance->m_nbest_results);
908
909 return retval;
910 }
911
_compute_prefixes(zhuyin_instance_t * instance,const char * prefix)912 static void _compute_prefixes(zhuyin_instance_t * instance,
913 const char * prefix){
914 zhuyin_context_t * & context = instance->m_context;
915 FacadePhraseIndex * & phrase_index = context->m_phrase_index;
916
917 glong len_str = 0;
918 ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL);
919 GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
920
921 if (ucs4_str && len_str) {
922 /* add prefixes. */
923 for (ssize_t i = 1; i <= len_str; ++i) {
924 if (i > MAX_PHRASE_LENGTH)
925 break;
926
927 ucs4_t * start = ucs4_str + len_str - i;
928
929 PhraseTokens tokens;
930 memset(tokens, 0, sizeof(tokens));
931 phrase_index->prepare_tokens(tokens);
932 int result = context->m_phrase_table->search(i, start, tokens);
933 int num = reduce_tokens(tokens, tokenarray);
934 phrase_index->destroy_tokens(tokens);
935
936 if (result & SEARCH_OK)
937 g_array_append_vals(instance->m_prefixes,
938 tokenarray->data, tokenarray->len);
939 }
940 }
941 g_array_free(tokenarray, TRUE);
942 g_free(ucs4_str);
943 }
944
zhuyin_guess_sentence_with_prefix(zhuyin_instance_t * instance,const char * prefix)945 bool zhuyin_guess_sentence_with_prefix(zhuyin_instance_t * instance,
946 const char * prefix){
947 zhuyin_context_t * & context = instance->m_context;
948 PhoneticKeyMatrix & matrix = instance->m_matrix;
949
950 g_array_set_size(instance->m_prefixes, 0);
951 g_array_append_val(instance->m_prefixes, sentence_start);
952
953 _compute_prefixes(instance, prefix);
954
955 zhuyin_update_constraints(instance);
956 bool retval = context->m_pinyin_lookup->get_nbest_match
957 (instance->m_prefixes,
958 &matrix,
959 instance->m_constraints,
960 &instance->m_nbest_results);
961
962 return retval;
963 }
964
zhuyin_phrase_segment(zhuyin_instance_t * instance,const char * sentence)965 bool zhuyin_phrase_segment(zhuyin_instance_t * instance,
966 const char * sentence){
967 zhuyin_context_t * & context = instance->m_context;
968
969 const glong num_of_chars = g_utf8_strlen(sentence, -1);
970 glong ucs4_len = 0;
971 ucs4_t * ucs4_str = g_utf8_to_ucs4(sentence, -1, NULL, &ucs4_len, NULL);
972
973 g_return_val_if_fail(num_of_chars == ucs4_len, FALSE);
974
975 bool retval = context->m_phrase_lookup->get_best_match
976 (ucs4_len, ucs4_str, instance->m_phrase_result);
977
978 g_free(ucs4_str);
979 return retval;
980 }
981
982 /* the returned sentence should be freed by g_free(). */
zhuyin_get_sentence(zhuyin_instance_t * instance,char ** sentence)983 bool zhuyin_get_sentence(zhuyin_instance_t * instance,
984 char ** sentence){
985 zhuyin_context_t * & context = instance->m_context;
986 NBestMatchResults & results = instance->m_nbest_results;
987
988 if (0 == results.size())
989 return false;
990
991 MatchResult result = NULL;
992 assert(results.get_result(0, result));
993
994 bool retval = pinyin::convert_to_utf8
995 (context->m_phrase_index, result,
996 NULL, false, *sentence);
997
998 return retval;
999 }
1000
zhuyin_parse_full_pinyin(zhuyin_instance_t * instance,const char * onepinyin,ChewingKey * onekey)1001 bool zhuyin_parse_full_pinyin(zhuyin_instance_t * instance,
1002 const char * onepinyin,
1003 ChewingKey * onekey){
1004 zhuyin_context_t * & context = instance->m_context;
1005 zhuyin_option_t options = context->m_options;
1006
1007 /* disable the pinyin correction options. */
1008 options &= ~PINYIN_CORRECT_ALL;
1009
1010 int pinyin_len = strlen(onepinyin);
1011 bool retval = context->m_full_pinyin_parser->parse_one_key
1012 (options, *onekey, onepinyin, pinyin_len);
1013 return retval;
1014 }
1015
zhuyin_parse_more_full_pinyins(zhuyin_instance_t * instance,const char * pinyins)1016 size_t zhuyin_parse_more_full_pinyins(zhuyin_instance_t * instance,
1017 const char * pinyins){
1018 zhuyin_context_t * & context = instance->m_context;
1019 zhuyin_option_t options = context->m_options;
1020 PhoneticKeyMatrix & matrix = instance->m_matrix;
1021
1022 /* disable the pinyin correction options. */
1023 options &= ~PINYIN_CORRECT_ALL;
1024
1025 ChewingKeyVector keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey));
1026 ChewingKeyRestVector key_rests =
1027 g_array_new(TRUE, TRUE, sizeof(ChewingKeyRest));
1028
1029 int parsed_len = context->m_full_pinyin_parser->parse
1030 (options, keys,
1031 key_rests, pinyins, strlen(pinyins));
1032
1033 instance->m_parsed_len = parsed_len;
1034
1035 fill_matrix(&matrix, keys, key_rests, parsed_len);
1036
1037 fuzzy_syllable_step(options, &matrix);
1038
1039 g_array_free(key_rests, TRUE);
1040 g_array_free(keys, TRUE);
1041 return parsed_len;
1042 }
1043
zhuyin_parse_chewing(zhuyin_instance_t * instance,const char * onechewing,ChewingKey * onekey)1044 bool zhuyin_parse_chewing(zhuyin_instance_t * instance,
1045 const char * onechewing,
1046 ChewingKey * onekey){
1047 zhuyin_context_t * & context = instance->m_context;
1048 zhuyin_option_t options = context->m_options;
1049
1050 int chewing_len = strlen(onechewing);
1051 bool retval = context->m_chewing_parser->parse_one_key
1052 (options, *onekey, onechewing, chewing_len );
1053 return retval;
1054 }
1055
zhuyin_parse_more_chewings(zhuyin_instance_t * instance,const char * chewings)1056 size_t zhuyin_parse_more_chewings(zhuyin_instance_t * instance,
1057 const char * chewings){
1058 zhuyin_context_t * & context = instance->m_context;
1059 zhuyin_option_t options = context->m_options;
1060 PhoneticKeyMatrix & matrix = instance->m_matrix;
1061
1062 ChewingKeyVector keys = g_array_new(TRUE, TRUE, sizeof(ChewingKey));
1063 ChewingKeyRestVector key_rests =
1064 g_array_new(TRUE, TRUE, sizeof(ChewingKeyRest));
1065
1066 int parsed_len = context->m_chewing_parser->parse
1067 (options, keys,
1068 key_rests, chewings, strlen(chewings));
1069
1070 instance->m_parsed_len = parsed_len;
1071
1072 fill_matrix(&matrix, keys, key_rests, parsed_len);
1073
1074 fuzzy_syllable_step(options, &matrix);
1075
1076 g_array_free(key_rests, TRUE);
1077 g_array_free(keys, TRUE);
1078 return parsed_len;
1079 }
1080
zhuyin_get_parsed_input_length(zhuyin_instance_t * instance)1081 size_t zhuyin_get_parsed_input_length(zhuyin_instance_t * instance) {
1082 return instance->m_parsed_len;
1083 }
1084
zhuyin_in_chewing_keyboard(zhuyin_instance_t * instance,const char key,gchar *** symbols)1085 bool zhuyin_in_chewing_keyboard(zhuyin_instance_t * instance,
1086 const char key, gchar *** symbols) {
1087 zhuyin_context_t * & context = instance->m_context;
1088 zhuyin_option_t options = context->m_options;
1089
1090 return context->m_chewing_parser->in_chewing_scheme
1091 (options, key, *symbols);
1092 }
1093
_token_get_phrase(FacadePhraseIndex * phrase_index,phrase_token_t token,guint * len,gchar ** utf8_str)1094 static bool _token_get_phrase(FacadePhraseIndex * phrase_index,
1095 phrase_token_t token,
1096 guint * len,
1097 gchar ** utf8_str) {
1098 PhraseItem item;
1099 ucs4_t buffer[MAX_PHRASE_LENGTH];
1100
1101 int retval = phrase_index->get_phrase_item(token, item);
1102 if (ERROR_OK != retval)
1103 return false;
1104
1105 item.get_phrase_string(buffer);
1106 guint length = item.get_phrase_length();
1107 if (len)
1108 *len = length;
1109 if (utf8_str)
1110 *utf8_str = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
1111 return true;
1112 }
1113
#if 0
/* disabled code: compares two lookup_candidate_t items by token. */
static gint compare_item_with_token(gconstpointer lhs,
                                    gconstpointer rhs) {
    phrase_token_t token_lhs = ((lookup_candidate_t *)lhs)->m_token;
    phrase_token_t token_rhs = ((lookup_candidate_t *)rhs)->m_token;

    return (token_lhs - token_rhs);
}
#endif
1126
compare_item_with_length_and_frequency(gconstpointer lhs,gconstpointer rhs)1127 static gint compare_item_with_length_and_frequency(gconstpointer lhs,
1128 gconstpointer rhs) {
1129 lookup_candidate_t * item_lhs = (lookup_candidate_t *)lhs;
1130 lookup_candidate_t * item_rhs = (lookup_candidate_t *)rhs;
1131
1132 guint8 len_lhs = item_lhs->m_phrase_length;
1133 guint8 len_rhs = item_rhs->m_phrase_length;
1134
1135 if (len_lhs != len_rhs)
1136 return -(len_lhs - len_rhs); /* in descendant order */
1137
1138 guint32 freq_lhs = item_lhs->m_freq;
1139 guint32 freq_rhs = item_rhs->m_freq;
1140
1141 return -(freq_lhs - freq_rhs); /* in descendant order */
1142 }
1143
_get_previous_token(zhuyin_instance_t * instance,size_t offset)1144 static phrase_token_t _get_previous_token(zhuyin_instance_t * instance,
1145 size_t offset) {
1146 zhuyin_context_t * context = instance->m_context;
1147 TokenVector prefixes = instance->m_prefixes;
1148 NBestMatchResults & results = instance->m_nbest_results;
1149
1150 phrase_token_t prev_token = null_token;
1151 ssize_t i;
1152
1153 if (0 == offset) {
1154 /* get previous token from prefixes. */
1155 prev_token = sentence_start;
1156 size_t prev_token_len = 0;
1157
1158 PhraseItem item;
1159 for (size_t i = 0; i < prefixes->len; ++i) {
1160 phrase_token_t token = g_array_index(prefixes, phrase_token_t, i);
1161 if (sentence_start == token)
1162 continue;
1163
1164 int retval = context->m_phrase_index->get_phrase_item(token, item);
1165 if (ERROR_OK == retval) {
1166 size_t token_len = item.get_phrase_length();
1167 if (token_len > prev_token_len) {
1168 /* found longer match, and save it. */
1169 prev_token = token;
1170 prev_token_len = token_len;
1171 }
1172 }
1173 }
1174 } else {
1175 /* get previous token from match results. */
1176 assert (0 < offset);
1177
1178 /* no nbest match result. */
1179 if (0 == results.size())
1180 return prev_token;
1181
1182 /* use the first candidate. */
1183 MatchResult result = NULL;
1184 assert(results.get_result(0, result));
1185
1186 phrase_token_t cur_token = g_array_index
1187 (result, phrase_token_t, offset);
1188 if (null_token != cur_token) {
1189 for (i = offset - 1; i >= 0; --i) {
1190 cur_token = g_array_index(result, phrase_token_t, i);
1191 if (null_token != cur_token) {
1192 prev_token = cur_token;
1193 break;
1194 }
1195 }
1196 }
1197 }
1198
1199 return prev_token;
1200 }
1201
_append_items(PhraseIndexRanges ranges,lookup_candidate_t * template_item,CandidateVector items)1202 static void _append_items(PhraseIndexRanges ranges,
1203 lookup_candidate_t * template_item,
1204 CandidateVector items) {
1205 /* reduce and append to a single GArray. */
1206 for (size_t m = 0; m < PHRASE_INDEX_LIBRARY_COUNT; ++m) {
1207 if (NULL == ranges[m])
1208 continue;
1209
1210 for (size_t n = 0; n < ranges[m]->len; ++n) {
1211 PhraseIndexRange * range =
1212 &g_array_index(ranges[m], PhraseIndexRange, n);
1213 for (size_t k = range->m_range_begin;
1214 k < range->m_range_end; ++k) {
1215 lookup_candidate_t item;
1216 item.m_candidate_type = template_item->m_candidate_type;
1217 item.m_token = k;
1218 item.m_begin = template_item->m_begin;
1219 item.m_end = template_item->m_end;
1220 item.m_freq = template_item->m_freq;
1221 g_array_append_val(items, item);
1222 }
1223 }
1224 }
1225 }
1226
_compute_frequency_of_items(zhuyin_context_t * context,phrase_token_t prev_token,SingleGram * merged_gram,CandidateVector items)1227 static void _compute_frequency_of_items(zhuyin_context_t * context,
1228 phrase_token_t prev_token,
1229 SingleGram * merged_gram,
1230 CandidateVector items) {
1231 pinyin_option_t & options = context->m_options;
1232 ssize_t i;
1233
1234 PhraseItem cached_item;
1235 /* compute all freqs. */
1236 for (i = 0; i < items->len; ++i) {
1237 lookup_candidate_t * item = &g_array_index
1238 (items, lookup_candidate_t, i);
1239 phrase_token_t & token = item->m_token;
1240
1241 gfloat bigram_poss = 0; guint32 total_freq = 0;
1242
1243 gfloat lambda = context->m_system_table_info.get_lambda();
1244
1245 if (options & DYNAMIC_ADJUST) {
1246 if (null_token != prev_token) {
1247 guint32 bigram_freq = 0;
1248 merged_gram->get_total_freq(total_freq);
1249 merged_gram->get_freq(token, bigram_freq);
1250 if (0 != total_freq)
1251 bigram_poss = bigram_freq / (gfloat)total_freq;
1252 }
1253 }
1254
1255 /* compute the m_freq. */
1256 FacadePhraseIndex * & phrase_index = context->m_phrase_index;
1257 phrase_index->get_phrase_item(token, cached_item);
1258 total_freq = phrase_index->get_phrase_index_total_freq();
1259 assert (0 < total_freq);
1260
1261 /* Note: possibility value <= 1.0. */
1262 guint32 freq = (lambda * bigram_poss +
1263 (1 - lambda) *
1264 cached_item.get_unigram_frequency() /
1265 (gfloat) total_freq) * 256 * 256 * 256;
1266 item->m_freq = freq;
1267 }
1268 }
1269
_prepend_sentence_candidates(zhuyin_instance_t * instance,CandidateVector candidates)1270 static bool _prepend_sentence_candidates(zhuyin_instance_t * instance,
1271 CandidateVector candidates) {
1272 const size_t size = instance->m_nbest_results.size();
1273
1274 /* check whether the nbest match candidate exists. */
1275 if (0 == size)
1276 return false;
1277
1278 /* prepend nbest match candidates to candidates. */
1279 for (ssize_t i = size - 1; i >= 0; --i) {
1280 lookup_candidate_t candidate;
1281 candidate.m_candidate_type = BEST_MATCH_CANDIDATE;
1282 g_array_prepend_val(candidates, candidate);
1283 }
1284
1285 return true;
1286 }
1287
_compute_phrase_length(zhuyin_context_t * context,CandidateVector candidates)1288 static bool _compute_phrase_length(zhuyin_context_t * context,
1289 CandidateVector candidates) {
1290 FacadePhraseIndex * phrase_index = context->m_phrase_index;
1291
1292 /* populate m_phrase_length in lookup_candidate_t. */
1293 PhraseItem item;
1294
1295 for(size_t i = 0; i < candidates->len; ++i) {
1296 lookup_candidate_t * candidate = &g_array_index
1297 (candidates, lookup_candidate_t, i);
1298
1299 switch(candidate->m_candidate_type) {
1300 case BEST_MATCH_CANDIDATE:
1301 assert(FALSE);
1302 case NORMAL_CANDIDATE_AFTER_CURSOR:
1303 case NORMAL_CANDIDATE_BEFORE_CURSOR: {
1304 phrase_index->get_phrase_item(candidate->m_token, item);
1305 candidate->m_phrase_length = item.get_phrase_length();
1306 break;
1307 }
1308 case ZOMBIE_CANDIDATE:
1309 assert(FALSE);
1310 }
1311 }
1312
1313 return true;
1314 }
1315
_compute_phrase_strings_of_items(zhuyin_instance_t * instance,CandidateVector candidates)1316 static bool _compute_phrase_strings_of_items(zhuyin_instance_t * instance,
1317 CandidateVector candidates) {
1318 /* populate m_phrase_string in lookup_candidate_t. */
1319
1320 for(size_t i = 0; i < candidates->len; ++i) {
1321 lookup_candidate_t * candidate = &g_array_index
1322 (candidates, lookup_candidate_t, i);
1323
1324 switch(candidate->m_candidate_type) {
1325 case BEST_MATCH_CANDIDATE: {
1326 gchar * sentence = NULL;
1327 zhuyin_get_sentence(instance, &sentence);
1328 candidate->m_phrase_string = sentence;
1329 break;
1330 }
1331 case NORMAL_CANDIDATE_AFTER_CURSOR:
1332 case NORMAL_CANDIDATE_BEFORE_CURSOR:
1333 _token_get_phrase
1334 (instance->m_context->m_phrase_index,
1335 candidate->m_token, NULL,
1336 &(candidate->m_phrase_string));
1337 break;
1338 case ZOMBIE_CANDIDATE:
1339 assert(FALSE);
1340 }
1341 }
1342
1343 return true;
1344 }
1345
compare_indexed_item_with_phrase_string(gconstpointer lhs,gconstpointer rhs,gpointer userdata)1346 static gint compare_indexed_item_with_phrase_string(gconstpointer lhs,
1347 gconstpointer rhs,
1348 gpointer userdata) {
1349 size_t index_lhs = *((size_t *) lhs);
1350 size_t index_rhs = *((size_t *) rhs);
1351 CandidateVector candidates = (CandidateVector) userdata;
1352
1353 lookup_candidate_t * candidate_lhs =
1354 &g_array_index(candidates, lookup_candidate_t, index_lhs);
1355 lookup_candidate_t * candidate_rhs =
1356 &g_array_index(candidates, lookup_candidate_t, index_rhs);
1357
1358 return -strcmp(candidate_lhs->m_phrase_string,
1359 candidate_rhs->m_phrase_string); /* in descendant order */
1360 }
1361
1362
_remove_duplicated_items_by_phrase_string(zhuyin_instance_t * instance,CandidateVector candidates)1363 static bool _remove_duplicated_items_by_phrase_string
1364 (zhuyin_instance_t * instance,
1365 CandidateVector candidates) {
1366 size_t i;
1367 /* create the GArray of indexed item */
1368 GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t));
1369 for (i = 0; i < candidates->len; ++i)
1370 g_array_append_val(indices, i);
1371
1372 /* sort the indices array by phrase array */
1373 g_array_sort_with_data
1374 (indices, compare_indexed_item_with_phrase_string, candidates);
1375
1376 /* mark duplicated items as zombie candidate */
1377 lookup_candidate_t * cur_item, * saved_item = NULL;
1378 for (i = 0; i < indices->len; ++i) {
1379 size_t cur_index = g_array_index(indices, size_t, i);
1380 cur_item = &g_array_index(candidates, lookup_candidate_t, cur_index);
1381
1382 /* handle the first candidate */
1383 if (NULL == saved_item) {
1384 saved_item = cur_item;
1385 continue;
1386 }
1387
1388 if (0 == strcmp(saved_item->m_phrase_string,
1389 cur_item->m_phrase_string)) {
1390 /* found duplicated candidates */
1391
1392 /* keep nbest match candidate */
1393 if (BEST_MATCH_CANDIDATE == saved_item->m_candidate_type) {
1394 cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
1395 continue;
1396 }
1397
1398 if (BEST_MATCH_CANDIDATE == cur_item->m_candidate_type) {
1399 saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
1400 saved_item = cur_item;
1401 continue;
1402 }
1403
1404 /* keep the higher possiblity one
1405 to quickly move the word forward in the candidate list */
1406 if (cur_item->m_freq > saved_item->m_freq) {
1407 /* find better candidate */
1408 saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
1409 saved_item = cur_item;
1410 continue;
1411 } else {
1412 cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
1413 continue;
1414 }
1415 } else {
1416 /* keep the current candidate */
1417 saved_item = cur_item;
1418 }
1419 }
1420
1421 g_array_free(indices, TRUE);
1422
1423 /* remove zombie candidate from the returned candidates */
1424 for (i = 0; i < candidates->len; ++i) {
1425 lookup_candidate_t * candidate = &g_array_index
1426 (candidates, lookup_candidate_t, i);
1427
1428 if (ZOMBIE_CANDIDATE == candidate->m_candidate_type) {
1429 g_free(candidate->m_phrase_string);
1430 g_array_remove_index(candidates, i);
1431 i--;
1432 }
1433 }
1434
1435 return true;
1436 }
1437
1438 /* offset must at the beginning of zero ChewingKey "'". */
_check_offset(PhoneticKeyMatrix & matrix,size_t offset)1439 static bool _check_offset(PhoneticKeyMatrix & matrix, size_t offset) {
1440 const size_t start = offset;
1441
1442 ChewingKey key; ChewingKeyRest key_rest;
1443 const ChewingKey zero_key;
1444
1445 if (start > 0) {
1446 const size_t index = start - 1;
1447 const size_t size = matrix.get_column_size(index);
1448 if (1 == size) {
1449 /* assume only one zero ChewingKey "'" here, but no check. */
1450 matrix.get_item(index, 0, key, key_rest);
1451 assert(zero_key != key);
1452 }
1453 }
1454
1455 return true;
1456 }
1457
zhuyin_guess_candidates_after_cursor(zhuyin_instance_t * instance,size_t offset)1458 bool zhuyin_guess_candidates_after_cursor(zhuyin_instance_t * instance,
1459 size_t offset) {
1460
1461 zhuyin_context_t * & context = instance->m_context;
1462 zhuyin_option_t & options = context->m_options;
1463 PhoneticKeyMatrix & matrix = instance->m_matrix;
1464 CandidateVector candidates = instance->m_candidates;
1465
1466 _free_candidates(candidates);
1467
1468 if (0 == matrix.size())
1469 return false;
1470
1471 /* lookup the previous token here. */
1472 phrase_token_t prev_token = null_token;
1473
1474 if (options & DYNAMIC_ADJUST) {
1475 prev_token = _get_previous_token(instance, offset);
1476 }
1477
1478 SingleGram merged_gram;
1479 SingleGram * system_gram = NULL, * user_gram = NULL;
1480
1481 if (options & DYNAMIC_ADJUST) {
1482 if (null_token != prev_token) {
1483 context->m_system_bigram->load(prev_token, system_gram);
1484 context->m_user_bigram->load(prev_token, user_gram);
1485 merge_single_gram(&merged_gram, system_gram, user_gram);
1486 }
1487 }
1488
1489 PhraseIndexRanges ranges;
1490 memset(ranges, 0, sizeof(ranges));
1491 context->m_phrase_index->prepare_ranges(ranges);
1492
1493 _check_offset(matrix, offset);
1494
1495 /* matrix reserved one extra slot. */
1496 const size_t start = offset;
1497 for (size_t end = start + 1; end < matrix.size(); ++end) {
1498 /* do pinyin search. */
1499 context->m_phrase_index->clear_ranges(ranges);
1500 int retval = search_matrix(context->m_pinyin_table, &matrix,
1501 start, end, ranges);
1502
1503 if ( !(retval & SEARCH_OK) )
1504 continue;
1505
1506 lookup_candidate_t template_item;
1507 template_item.m_begin = start; template_item.m_end = end;
1508 _append_items(ranges, &template_item, candidates);
1509
1510 if ( !(retval & SEARCH_CONTINUED) )
1511 break;
1512 }
1513
1514 context->m_phrase_index->destroy_ranges(ranges);
1515 if (system_gram)
1516 delete system_gram;
1517 if (user_gram)
1518 delete user_gram;
1519
1520 /* post process to sort the candidates */
1521
1522 _compute_phrase_length(context, candidates);
1523
1524 _compute_frequency_of_items(context, prev_token, &merged_gram, candidates);
1525
1526 /* sort the candidates by length and frequency. */
1527 g_array_sort(candidates, compare_item_with_length_and_frequency);
1528
1529 /* post process to remove duplicated candidates */
1530
1531 _prepend_sentence_candidates(instance, instance->m_candidates);
1532
1533 _compute_phrase_strings_of_items(instance, instance->m_candidates);
1534
1535 _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates);
1536
1537 return true;
1538 }
1539
zhuyin_guess_candidates_before_cursor(zhuyin_instance_t * instance,size_t offset)1540 bool zhuyin_guess_candidates_before_cursor(zhuyin_instance_t * instance,
1541 size_t offset) {
1542 zhuyin_context_t * & context = instance->m_context;
1543 zhuyin_option_t & options = context->m_options;
1544 PhoneticKeyMatrix & matrix = instance->m_matrix;
1545 CandidateVector candidates = instance->m_candidates;
1546
1547 _free_candidates(candidates);
1548
1549 if (0 == matrix.size())
1550 return false;
1551
1552 PhraseIndexRanges ranges;
1553 memset(ranges, 0, sizeof(ranges));
1554 context->m_phrase_index->prepare_ranges(ranges);
1555
1556 _check_offset(matrix, offset);
1557
1558 GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t));
1559
1560 /* matrix reserved one extra slot. */
1561 for (size_t len = offset; len >= 1; --len) {
1562 _free_candidates(items);
1563 const size_t start = offset - len;
1564
1565 /* lookup the previous token here. */
1566 phrase_token_t prev_token = null_token;
1567
1568 if (options & DYNAMIC_ADJUST) {
1569 prev_token = _get_previous_token(instance, start);
1570 }
1571
1572 SingleGram merged_gram;
1573 SingleGram * system_gram = NULL, * user_gram = NULL;
1574
1575 if (options & DYNAMIC_ADJUST) {
1576 if (null_token != prev_token) {
1577 context->m_system_bigram->load(prev_token, system_gram);
1578 context->m_user_bigram->load(prev_token, user_gram);
1579 merge_single_gram(&merged_gram, system_gram, user_gram);
1580 }
1581 }
1582
1583 /* do pinyin search. */
1584 context->m_phrase_index->clear_ranges(ranges);
1585 int retval = search_matrix(context->m_pinyin_table, &matrix,
1586 start, offset, ranges);
1587
1588 if ( !(retval & SEARCH_OK) )
1589 continue;
1590
1591 lookup_candidate_t template_item;
1592 template_item.m_candidate_type = NORMAL_CANDIDATE_BEFORE_CURSOR;
1593 template_item.m_begin = start; template_item.m_end = offset;
1594 _append_items(ranges, &template_item, items);
1595
1596 if (system_gram)
1597 delete system_gram;
1598 if (user_gram)
1599 delete user_gram;
1600
1601 /* post process to sort the items */
1602
1603 _compute_phrase_length(context, items);
1604
1605 _compute_frequency_of_items(context, prev_token, &merged_gram, items);
1606
1607 /* sort the items by length and frequency. */
1608 g_array_sort(items, compare_item_with_length_and_frequency);
1609
1610 g_array_append_vals(candidates, items->data, items->len);
1611
1612 #if 0
1613 /* no continue information. */
1614 if ( !(retval & SEARCH_CONTINUED) )
1615 break;
1616 #endif
1617 }
1618
1619 _free_candidates(items);
1620 context->m_phrase_index->destroy_ranges(ranges);
1621
1622 /* post process to remove duplicated candidates */
1623
1624 _prepend_sentence_candidates(instance, instance->m_candidates);
1625
1626 _compute_phrase_strings_of_items(instance, instance->m_candidates);
1627
1628 _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates);
1629
1630 return true;
1631 }
1632
zhuyin_choose_candidate(zhuyin_instance_t * instance,size_t offset,lookup_candidate_t * candidate)1633 int zhuyin_choose_candidate(zhuyin_instance_t * instance,
1634 size_t offset,
1635 lookup_candidate_t * candidate){
1636 zhuyin_context_t * & context = instance->m_context;
1637 PhoneticKeyMatrix & matrix = instance->m_matrix;
1638 ForwardPhoneticConstraints * constraints = instance->m_constraints;
1639 NBestMatchResults & results = instance->m_nbest_results;
1640
1641 if (BEST_MATCH_CANDIDATE == candidate->m_candidate_type)
1642 return matrix.size() - 1;
1643
1644 /* sync m_constraints to the length of m_pinyin_keys. */
1645 bool retval = constraints->validate_constraint(&matrix);
1646
1647 if (NORMAL_CANDIDATE_AFTER_CURSOR == candidate->m_candidate_type) {
1648 phrase_token_t token = candidate->m_token;
1649 guint8 len = constraints->add_constraint
1650 (candidate->m_begin, candidate->m_end, token);
1651 offset = candidate->m_end;
1652 }
1653
1654 if (NORMAL_CANDIDATE_BEFORE_CURSOR == candidate->m_candidate_type) {
1655 phrase_token_t token = candidate->m_token;
1656 guint8 len = constraints->add_constraint
1657 (candidate->m_begin, candidate->m_end, token);
1658 offset = candidate->m_begin;
1659 }
1660
1661 /* safe guard: validate the m_constraints again. */
1662 retval = constraints->validate_constraint(&matrix);
1663
1664 return offset;
1665 }
1666
zhuyin_clear_constraint(zhuyin_instance_t * instance,size_t offset)1667 bool zhuyin_clear_constraint(zhuyin_instance_t * instance,
1668 size_t offset){
1669 ForwardPhoneticConstraints * constraints = instance->m_constraints;
1670
1671 bool retval = constraints->clear_constraint(offset);
1672
1673 return retval;
1674 }
1675
zhuyin_lookup_tokens(zhuyin_instance_t * instance,const char * phrase,GArray * tokenarray)1676 bool zhuyin_lookup_tokens(zhuyin_instance_t * instance,
1677 const char * phrase, GArray * tokenarray){
1678 zhuyin_context_t * & context = instance->m_context;
1679 FacadePhraseIndex * & phrase_index = context->m_phrase_index;
1680
1681 glong ucs4_len = 0;
1682 ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &ucs4_len, NULL);
1683
1684 PhraseTokens tokens;
1685 memset(tokens, 0, sizeof(PhraseTokens));
1686 phrase_index->prepare_tokens(tokens);
1687 int retval = context->m_phrase_table->search(ucs4_len, ucs4_phrase, tokens);
1688 int num = reduce_tokens(tokens, tokenarray);
1689 phrase_index->destroy_tokens(tokens);
1690
1691 return SEARCH_OK & retval;
1692 }
1693
zhuyin_train(zhuyin_instance_t * instance)1694 bool zhuyin_train(zhuyin_instance_t * instance){
1695 if (!instance->m_context->m_user_dir)
1696 return false;
1697
1698 zhuyin_context_t * context = instance->m_context;
1699 PhoneticKeyMatrix & matrix = instance->m_matrix;
1700 NBestMatchResults & results = instance->m_nbest_results;
1701
1702 if (0 == results.size())
1703 return false;
1704
1705 context->m_modified = true;
1706
1707 MatchResult result = NULL;
1708 assert(results.get_result(0, result));
1709
1710 bool retval = context->m_pinyin_lookup->train_result3
1711 (&matrix, instance->m_constraints, result);
1712
1713 return retval;
1714 }
1715
zhuyin_reset(zhuyin_instance_t * instance)1716 bool zhuyin_reset(zhuyin_instance_t * instance){
1717 instance->m_parsed_len = 0;
1718 instance->m_matrix.clear_all();
1719
1720 g_array_set_size(instance->m_prefixes, 0);
1721
1722 instance->m_constraints->clear();
1723 instance->m_nbest_results.clear();
1724 g_array_set_size(instance->m_phrase_result, 0);
1725 _free_candidates(instance->m_candidates);
1726
1727 return true;
1728 }
1729
zhuyin_get_zhuyin_string(zhuyin_instance_t * instance,ChewingKey * key,gchar ** utf8_str)1730 bool zhuyin_get_zhuyin_string(zhuyin_instance_t * instance,
1731 ChewingKey * key,
1732 gchar ** utf8_str) {
1733 *utf8_str = NULL;
1734 if (0 == key->get_table_index())
1735 return false;
1736
1737 *utf8_str = key->get_zhuyin_string();
1738 return true;
1739 }
1740
zhuyin_get_pinyin_string(zhuyin_instance_t * instance,ChewingKey * key,gchar ** utf8_str)1741 bool zhuyin_get_pinyin_string(zhuyin_instance_t * instance,
1742 ChewingKey * key,
1743 gchar ** utf8_str) {
1744 zhuyin_context_t * context = instance->m_context;
1745 FullPinyinScheme scheme = context->m_full_pinyin_scheme;
1746
1747 *utf8_str = NULL;
1748 if (0 == key->get_table_index())
1749 return false;
1750
1751 switch(scheme) {
1752 case FULL_PINYIN_HANYU:
1753 *utf8_str = key->get_pinyin_string();
1754 break;
1755 case FULL_PINYIN_LUOMA:
1756 *utf8_str = key->get_luoma_pinyin_string();
1757 break;
1758 case FULL_PINYIN_SECONDARY_ZHUYIN:
1759 *utf8_str = key->get_secondary_zhuyin_string();
1760 break;
1761 }
1762
1763 return true;
1764 }
1765
zhuyin_token_get_phrase(zhuyin_instance_t * instance,phrase_token_t token,guint * len,gchar ** utf8_str)1766 bool zhuyin_token_get_phrase(zhuyin_instance_t * instance,
1767 phrase_token_t token,
1768 guint * len,
1769 gchar ** utf8_str) {
1770 zhuyin_context_t * & context = instance->m_context;
1771
1772 return _token_get_phrase(context->m_phrase_index,
1773 token, len, utf8_str);
1774 }
1775
zhuyin_token_get_n_pronunciation(zhuyin_instance_t * instance,phrase_token_t token,guint * num)1776 bool zhuyin_token_get_n_pronunciation(zhuyin_instance_t * instance,
1777 phrase_token_t token,
1778 guint * num){
1779 *num = 0;
1780 zhuyin_context_t * & context = instance->m_context;
1781 PhraseItem item;
1782
1783 int retval = context->m_phrase_index->get_phrase_item(token, item);
1784 if (ERROR_OK != retval)
1785 return false;
1786
1787 *num = item.get_n_pronunciation();
1788 return true;
1789 }
1790
zhuyin_token_get_nth_pronunciation(zhuyin_instance_t * instance,phrase_token_t token,guint nth,ChewingKeyVector keys)1791 bool zhuyin_token_get_nth_pronunciation(zhuyin_instance_t * instance,
1792 phrase_token_t token,
1793 guint nth,
1794 ChewingKeyVector keys){
1795 g_array_set_size(keys, 0);
1796 zhuyin_context_t * & context = instance->m_context;
1797 PhraseItem item;
1798 ChewingKey buffer[MAX_PHRASE_LENGTH];
1799 guint32 freq = 0;
1800
1801 int retval = context->m_phrase_index->get_phrase_item(token, item);
1802 if (ERROR_OK != retval)
1803 return false;
1804
1805 item.get_nth_pronunciation(nth, buffer, freq);
1806 guint8 len = item.get_phrase_length();
1807 g_array_append_vals(keys, buffer, len);
1808 return true;
1809 }
1810
zhuyin_token_get_unigram_frequency(zhuyin_instance_t * instance,phrase_token_t token,guint * freq)1811 bool zhuyin_token_get_unigram_frequency(zhuyin_instance_t * instance,
1812 phrase_token_t token,
1813 guint * freq) {
1814 *freq = 0;
1815 zhuyin_context_t * & context = instance->m_context;
1816 PhraseItem item;
1817
1818 int retval = context->m_phrase_index->get_phrase_item(token, item);
1819 if (ERROR_OK != retval)
1820 return false;
1821
1822 *freq = item.get_unigram_frequency();
1823 return true;
1824 }
1825
zhuyin_token_add_unigram_frequency(zhuyin_instance_t * instance,phrase_token_t token,guint delta)1826 bool zhuyin_token_add_unigram_frequency(zhuyin_instance_t * instance,
1827 phrase_token_t token,
1828 guint delta){
1829 zhuyin_context_t * & context = instance->m_context;
1830 int retval = context->m_phrase_index->add_unigram_frequency
1831 (token, delta);
1832 return ERROR_OK == retval;
1833 }
1834
zhuyin_get_n_candidate(zhuyin_instance_t * instance,guint * num)1835 bool zhuyin_get_n_candidate(zhuyin_instance_t * instance,
1836 guint * num) {
1837 *num = instance->m_candidates->len;
1838 return true;
1839 }
1840
zhuyin_get_candidate(zhuyin_instance_t * instance,guint index,lookup_candidate_t ** candidate)1841 bool zhuyin_get_candidate(zhuyin_instance_t * instance,
1842 guint index,
1843 lookup_candidate_t ** candidate) {
1844 CandidateVector & candidates = instance->m_candidates;
1845
1846 *candidate = NULL;
1847
1848 if (index >= candidates->len)
1849 return false;
1850
1851 *candidate = &g_array_index(candidates, lookup_candidate_t, index);
1852
1853 return true;
1854 }
1855
zhuyin_get_candidate_type(zhuyin_instance_t * instance,lookup_candidate_t * candidate,lookup_candidate_type_t * type)1856 bool zhuyin_get_candidate_type(zhuyin_instance_t * instance,
1857 lookup_candidate_t * candidate,
1858 lookup_candidate_type_t * type) {
1859 *type = candidate->m_candidate_type;
1860 return true;
1861 }
1862
zhuyin_get_candidate_string(zhuyin_instance_t * instance,lookup_candidate_t * candidate,const gchar ** utf8_str)1863 bool zhuyin_get_candidate_string(zhuyin_instance_t * instance,
1864 lookup_candidate_t * candidate,
1865 const gchar ** utf8_str) {
1866 *utf8_str = candidate->m_phrase_string;
1867 return true;
1868 }
1869
#if 0
/* disabled code: reports the number of pinyin keys, provided that the
   key and key rest arrays have the same length. */
bool zhuyin_get_n_zhuyin(zhuyin_instance_t * instance,
                         guint * num) {
    *num = 0;

    const guint n_keys = instance->m_pinyin_keys->len;
    if (n_keys != instance->m_pinyin_key_rests->len)
        return false;

    *num = n_keys;
    return true;
}
#endif
1883
zhuyin_get_zhuyin_key(zhuyin_instance_t * instance,size_t offset,ChewingKey ** ppkey)1884 bool zhuyin_get_zhuyin_key(zhuyin_instance_t * instance,
1885 size_t offset,
1886 ChewingKey ** ppkey) {
1887 PhoneticKeyMatrix & matrix = instance->m_matrix;
1888 *ppkey = NULL;
1889
1890 if (offset >= matrix.size() - 1)
1891 return false;
1892
1893 if (0 == matrix.get_column_size(offset))
1894 return false;
1895
1896 _check_offset(matrix, offset);
1897
1898 static ChewingKey key;
1899 ChewingKeyRest key_rest;
1900 matrix.get_item(offset, 0, key, key_rest);
1901
1902 *ppkey = &key;
1903 return true;
1904 }
1905
zhuyin_get_zhuyin_key_rest(zhuyin_instance_t * instance,size_t offset,ChewingKeyRest ** ppkey_rest)1906 bool zhuyin_get_zhuyin_key_rest(zhuyin_instance_t * instance,
1907 size_t offset,
1908 ChewingKeyRest ** ppkey_rest) {
1909 PhoneticKeyMatrix & matrix = instance->m_matrix;
1910 *ppkey_rest = NULL;
1911
1912 if (offset >= matrix.size() - 1)
1913 return false;
1914
1915 if (0 == matrix.get_column_size(offset))
1916 return false;
1917
1918 _check_offset(matrix, offset);
1919
1920 ChewingKey key;
1921 static ChewingKeyRest key_rest;
1922 matrix.get_item(offset, 0, key, key_rest);
1923
1924 *ppkey_rest = &key_rest;
1925 return true;
1926 }
1927
zhuyin_get_zhuyin_key_rest_positions(zhuyin_instance_t * instance,ChewingKeyRest * key_rest,guint16 * begin,guint16 * end)1928 bool zhuyin_get_zhuyin_key_rest_positions(zhuyin_instance_t * instance,
1929 ChewingKeyRest * key_rest,
1930 guint16 * begin, guint16 * end) {
1931 if (begin)
1932 *begin = key_rest->m_raw_begin;
1933
1934 if (end)
1935 *end = key_rest->m_raw_end;
1936
1937 return true;
1938 }
1939
zhuyin_get_zhuyin_key_rest_length(zhuyin_instance_t * instance,ChewingKeyRest * key_rest,guint16 * length)1940 bool zhuyin_get_zhuyin_key_rest_length(zhuyin_instance_t * instance,
1941 ChewingKeyRest * key_rest,
1942 guint16 * length) {
1943 *length = key_rest->length();
1944 return true;
1945 }
1946
1947 /* when lookup offset:
1948 get the previous non-zero ChewingKey. */
zhuyin_get_zhuyin_offset(zhuyin_instance_t * instance,size_t cursor,size_t * poffset)1949 bool zhuyin_get_zhuyin_offset(zhuyin_instance_t * instance,
1950 size_t cursor,
1951 size_t * poffset) {
1952 PhoneticKeyMatrix & matrix = instance->m_matrix;
1953 size_t offset = std_lite::min(cursor, instance->m_parsed_len);
1954
1955 /* find the first ChewingKey. */
1956 for (; offset > 0; --offset) {
1957 const size_t size = matrix.get_column_size(offset);
1958
1959 if (size > 0)
1960 break;
1961 }
1962
1963 _check_offset(matrix, offset);
1964
1965 *poffset = offset;
1966 return true;
1967 }
1968
zhuyin_get_left_zhuyin_offset(zhuyin_instance_t * instance,size_t offset,size_t * pleft)1969 bool zhuyin_get_left_zhuyin_offset(zhuyin_instance_t * instance,
1970 size_t offset,
1971 size_t * pleft) {
1972 PhoneticKeyMatrix & matrix = instance->m_matrix;
1973 _check_offset(matrix, offset);
1974
1975 /* find the ChewingKey ends at offset. */
1976 size_t left = offset > 0 ? offset - 1 : 0;
1977
1978 ChewingKey key; ChewingKeyRest key_rest;
1979 for (; left > 0; --left) {
1980 const size_t size = matrix.get_column_size(left);
1981
1982 size_t i = 0;
1983 for (; i < size; ++i) {
1984 matrix.get_item(left, i, key, key_rest);
1985
1986 if (offset == key_rest.m_raw_end)
1987 break;
1988 }
1989
1990 if (i < size)
1991 break;
1992 }
1993
1994 _check_offset(matrix, left);
1995
1996 *pleft = left;
1997 return true;
1998 }
1999
zhuyin_get_right_zhuyin_offset(zhuyin_instance_t * instance,size_t offset,size_t * pright)2000 bool zhuyin_get_right_zhuyin_offset(zhuyin_instance_t * instance,
2001 size_t offset,
2002 size_t * pright) {
2003 PhoneticKeyMatrix & matrix = instance->m_matrix;
2004 _check_offset(matrix, offset);
2005
2006 /* find the first non-zero ChewingKey. */
2007 size_t right = offset;
2008
2009 ChewingKey key; ChewingKeyRest key_rest;
2010 for (size_t index = right; index < matrix.size() - 1; ++index) {
2011 const size_t size = matrix.get_column_size(index);
2012
2013 if (1 != size)
2014 break;
2015
2016 matrix.get_item(index, 0, key, key_rest);
2017 break;
2018 }
2019
2020 if (0 == matrix.get_column_size(right))
2021 return false;
2022
2023 matrix.get_item(right, 0, key, key_rest);
2024 right = key_rest.m_raw_end;
2025 _check_offset(matrix, right);
2026
2027 *pright = right;
2028 return true;
2029 }
2030
_pre_compute_tokens(zhuyin_context_t * context,TokenVector cached_tokens,ucs4_t * phrase,size_t phrase_length)2031 static bool _pre_compute_tokens(zhuyin_context_t * context,
2032 TokenVector cached_tokens,
2033 ucs4_t * phrase,
2034 size_t phrase_length) {
2035 FacadePhraseIndex * phrase_index = context->m_phrase_index;
2036 FacadePhraseTable3 * phrase_table = context->m_phrase_table;
2037
2038 /* do phrase table search. */
2039 PhraseTokens tokens;
2040 memset(tokens, 0, sizeof(PhraseTokens));
2041 phrase_index->prepare_tokens(tokens);
2042
2043 for (size_t i = 0; i < phrase_length; ++i) {
2044 phrase_token_t token = null_token;
2045 ucs4_t character = phrase[i];
2046
2047 phrase_index->clear_tokens(tokens);
2048 int retval = phrase_table->search(1, &character, tokens);
2049
2050 int num = get_first_token(tokens, token);
2051 /* en-counter un-known character, such as the emoji unicode. */
2052 if (0 == num) {
2053 phrase_index->destroy_tokens(tokens);
2054 return false;
2055 }
2056
2057 g_array_append_val(cached_tokens, token);
2058 }
2059
2060 phrase_index->destroy_tokens(tokens);
2061
2062 return true;
2063 }
2064
_get_char_offset_recur(zhuyin_instance_t * instance,TokenVector cached_tokens,size_t start,size_t offset,size_t * plength)2065 static bool _get_char_offset_recur(zhuyin_instance_t * instance,
2066 TokenVector cached_tokens,
2067 size_t start,
2068 size_t offset,
2069 size_t * plength) {
2070 zhuyin_context_t * context = instance->m_context;
2071 PhoneticKeyMatrix & matrix = instance->m_matrix;
2072 FacadePhraseIndex * phrase_index = context->m_phrase_index;
2073 size_t length = *plength;
2074
2075 if (start > offset)
2076 return true;
2077
2078 const size_t size = matrix.get_column_size(start);
2079 /* assume pinyin parsers will filter invalid keys. */
2080 assert(size > 0);
2081
2082 bool result = false;
2083
2084 PhraseItem item;
2085 for (size_t i = 0; i < size; ++i) {
2086 ChewingKey key; ChewingKeyRest key_rest;
2087 matrix.get_item(start, i, key, key_rest);
2088
2089 const size_t newstart = key_rest.m_raw_end;
2090
2091 /* check pronunciation */
2092 phrase_token_t token = g_array_index
2093 (cached_tokens, phrase_token_t, length);
2094 phrase_index->get_phrase_item(token, item);
2095
2096 gfloat pinyin_poss = item.get_pronunciation_possibility(&key);
2097 if (pinyin_poss < FLT_EPSILON)
2098 continue;
2099
2100 if (newstart > offset)
2101 return true;
2102
2103 ++length;
2104
2105 result = _get_char_offset_recur
2106 (instance, cached_tokens, newstart, offset, &length);
2107 if (result) {
2108 *plength = length;
2109 return result;
2110 }
2111
2112 --length;
2113 }
2114
2115 return result;
2116 }
2117
zhuyin_get_character_offset(zhuyin_instance_t * instance,const char * phrase,size_t offset,size_t * plength)2118 bool zhuyin_get_character_offset(zhuyin_instance_t * instance,
2119 const char * phrase,
2120 size_t offset,
2121 size_t * plength) {
2122 zhuyin_context_t * context = instance->m_context;
2123 PhoneticKeyMatrix & matrix = instance->m_matrix;
2124
2125 if (0 == matrix.size())
2126 return false;
2127
2128 assert(offset < matrix.size());
2129 _check_offset(matrix, offset);
2130
2131 if (NULL == phrase)
2132 return false;
2133
2134 glong phrase_length = 0;
2135 ucs4_t * ucs4_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &phrase_length, NULL);
2136
2137 if (0 == phrase_length)
2138 return false;
2139
2140 size_t length = 0;
2141 const size_t start = 0;
2142
2143 /* pre-compute the tokens vector from phrase. */
2144 TokenVector cached_tokens = g_array_new(TRUE, TRUE, sizeof(phrase_token_t));
2145
2146 bool retval = _pre_compute_tokens
2147 (context, cached_tokens, ucs4_phrase, phrase_length);
2148
2149 if (!retval) {
2150 g_array_free(cached_tokens, TRUE);
2151 g_free(ucs4_phrase);
2152 return false;
2153 }
2154
2155 assert(cached_tokens->len == phrase_length);
2156
2157 bool result = _get_char_offset_recur
2158 (instance, cached_tokens, start, offset, &length);
2159
2160 g_array_free(cached_tokens, TRUE);
2161 g_free(ucs4_phrase);
2162
2163 *plength = length;
2164 return result;
2165 }
2166
/* Disabled (compiled out) variant of zhuyin_get_character_offset that
 * takes no phrase argument: it sums the phrase lengths of the non-null
 * tokens found in the first `offset` slots of the instance's match
 * results.
 */
#if 0
bool zhuyin_get_character_offset(zhuyin_instance_t * instance,
                                 size_t offset,
                                 size_t * plength) {
    zhuyin_context_t * context = instance->m_context;
    FacadePhraseIndex * phrase_index = context->m_phrase_index;

    PhoneticKeyMatrix & matrix = instance->m_matrix;
    MatchResults results = instance->m_match_results;
    _check_offset(matrix, offset);

    size_t length = 0;
    PhraseItem item;
    for (size_t i = 0; i < offset; ++i) {
        phrase_token_t token = g_array_index(results, phrase_token_t, i);
        /* slots without a token contribute no characters. */
        if (null_token == token)
            continue;

        int retval = phrase_index->get_phrase_item(token, item);
        assert(ERROR_OK == retval);
        /* accumulate the number of characters of this phrase. */
        guint8 len = item.get_phrase_length();
        length += len;
    }

    *plength = length;
    return true;
}
#endif
2195
2196
zhuyin_get_n_phrase(zhuyin_instance_t * instance,guint * num)2197 bool zhuyin_get_n_phrase(zhuyin_instance_t * instance,
2198 guint * num) {
2199 *num = instance->m_phrase_result->len;
2200 return true;
2201 }
2202
zhuyin_get_phrase_token(zhuyin_instance_t * instance,guint index,phrase_token_t * token)2203 bool zhuyin_get_phrase_token(zhuyin_instance_t * instance,
2204 guint index,
2205 phrase_token_t * token){
2206 MatchResult & result = instance->m_phrase_result;
2207
2208 *token = null_token;
2209
2210 if (index >= result->len)
2211 return false;
2212
2213 *token = g_array_index(result, phrase_token_t, index);
2214
2215 return true;
2216 }
2217
2218 /**
2219 * Note: prefix is the text before the pre-edit string.
2220 */
2221