1 /** @file scim_pinyin_phrase.cpp
2 * implementation of PinyinPhrase, PinyinPhraseLib and related classes.
3 */
4
5 /*
6 * Smart Pinyin Input Method
7 *
8 * Copyright (c) 2005 James Su <suzhe@tsinghua.org.cn>
9 *
10 * $Id: scim_pinyin_phrase.cpp,v 1.3 2006/01/13 06:31:46 suzhe Exp $
11 *
12 */
13 #define Uses_STL_FUNCTIONAL
14 #define Uses_STL_VECTOR
15 #define Uses_STL_IOSTREAM
16 #define Uses_STL_FSTREAM
17 #define Uses_STL_ALGORITHM
18 #define Uses_STL_MAP
19 #define Uses_STL_UTILITY
20 #define Uses_STL_IOMANIP
21 #define Uses_C_STDIO
22 #define Uses_SCIM_UTILITY
23 #define Uses_SCIM_SERVER
24 #define Uses_SCIM_ICONV
25 #define Uses_SCIM_CONFIG_BASE
26 #define Uses_SCIM_CONFIG_PATH
27 #define Uses_SCIM_LOOKUP_TABLE
28
29 #include <cstring>
30
31 #include <scim.h>
32 #include "scim_pinyin_private.h"
33 #include "scim_phrase.h"
34 #include "scim_pinyin.h"
35 #include "scim_pinyin_phrase.h"
36
37 static const char scim_pinyin_lib_text_header [] = "SCIM_Pinyin_Library_TEXT";
38 static const char scim_pinyin_lib_binary_header [] = "SCIM_Pinyin_Library_BINARY";
39 static const char scim_pinyin_lib_version [] = "VERSION_0_1";
40
41 static const char scim_pinyin_phrase_idx_lib_text_header [] = "SCIM_Pinyin_Phrase_Index_Library_TEXT";
42 static const char scim_pinyin_phrase_idx_lib_binary_header [] = "SCIM_Pinyin_Phrase_Index_Library_BINARY";
43 static const char scim_pinyin_phrase_idx_lib_version [] = "VERSION_0_1";
44
45 bool
operator ()(const PinyinPhrase & lhs,const PinyinPhrase & rhs) const46 PinyinPhraseLessThan::operator () (const PinyinPhrase &lhs,
47 const PinyinPhrase &rhs) const
48 {
49 if (lhs.get_phrase () < rhs.get_phrase ()) return true;
50 else if (lhs.get_phrase () == rhs.get_phrase ()) {
51 for (unsigned int i=0; i<lhs.length(); i++) {
52 if (m_less (lhs.get_key (i), rhs.get_key (i))) return true;
53 else if (m_less (rhs.get_key (i), lhs.get_key (i))) return false;
54 }
55 }
56 return false;
57 }
58
59 bool
operator ()(const PinyinPhrase & lhs,const PinyinPhrase & rhs) const60 PinyinPhraseEqualTo::operator () (const PinyinPhrase &lhs,
61 const PinyinPhrase &rhs) const
62 {
63 if (lhs.get_lib () == rhs.get_lib ()&&
64 lhs.get_pinyin_offset () == rhs.get_pinyin_offset () &&
65 lhs.get_phrase_offset () == rhs.get_phrase_offset ())
66 return true;
67 else if (!(lhs.get_phrase () == rhs.get_phrase ())) return false;
68 else {
69 for (unsigned int i=0; i<lhs.length(); i++)
70 if (!m_equal (lhs.get_key (i), rhs.get_key (i))) return false;
71 }
72 return true;
73 }
74
75 bool
output_pinyin_lib(std::ostream & os,bool binary)76 PinyinPhraseLib::output_pinyin_lib (std::ostream &os, bool binary)
77 {
78 if (m_pinyin_lib.size () == 0) return false;
79
80 if (binary) {
81 unsigned char bytes [4];
82
83 os << scim_pinyin_lib_binary_header << "\n";
84 os << scim_pinyin_lib_version << "\n";
85
86 scim_uint32tobytes (bytes, m_pinyin_lib.size ());
87 os.write ((char*) bytes, sizeof (unsigned char) * 4);
88
89 for (PinyinKeyVector::iterator i=m_pinyin_lib.begin (); i!=m_pinyin_lib.end (); i++)
90 i->output_binary (os);
91
92 } else {
93 uint32 count = 0;
94 os << scim_pinyin_lib_text_header << "\n";
95 os << scim_pinyin_lib_version << "\n";
96 os << m_pinyin_lib.size () << "\n";
97
98 for (PinyinKeyVector::iterator i=m_pinyin_lib.begin (); i!=m_pinyin_lib.end (); i++) {
99 i->output_text (os);
100 os << " ";
101 count ++;
102 if (count == 32) {
103 os << "\n";
104 count = 0;
105 }
106 }
107 }
108 return true;
109 }
110
111 bool
input_pinyin_lib(const PinyinValidator & validator,std::istream & is)112 PinyinPhraseLib::input_pinyin_lib (const PinyinValidator &validator, std::istream &is)
113 {
114 if (!is) return false;
115
116 m_pinyin_lib.clear ();
117
118 char header [40];
119 bool binary;
120
121 //check header
122 is.getline (header, 40);
123 if (std::strncmp (header,
124 scim_pinyin_lib_text_header,
125 std::strlen (scim_pinyin_lib_text_header)) == 0) {
126 binary = false;
127 } else if (std::strncmp (header,
128 scim_pinyin_lib_binary_header,
129 std::strlen (scim_pinyin_lib_binary_header)) == 0) {
130 binary = true;
131 } else {
132 return false;
133 }
134
135 is.getline (header, 40);
136 if (std::strncmp (header, scim_pinyin_lib_version, std::strlen (scim_pinyin_lib_version)) != 0)
137 return false;
138
139 unsigned char bytes [4];
140 PinyinKey key;
141 uint32 number;
142
143 //get length
144 if (binary) {
145 is.read ((char*) bytes, sizeof(unsigned char) * 4);
146 number = scim_bytestouint32 (bytes);
147 } else {
148 is.getline (header, 40);
149 number = atoi (header);
150 }
151
152 if (number <= 0) return false;
153
154 m_pinyin_lib.reserve (number + 256);
155
156 if (binary) {
157 for (uint32 i=0; i<number; i++) {
158 key.input_binary (validator, is);
159 m_pinyin_lib.push_back (key);
160 }
161 } else {
162 for (uint32 i=0; i<number; i++) {
163 key.input_text (validator, is);
164 m_pinyin_lib.push_back (key);
165 }
166 }
167
168 return true;
169 }
170
171 //Pinyin Phrase Library
PinyinPhraseLib(const PinyinCustomSettings & custom,const PinyinValidator * validator,PinyinTable * pinyin_table,const char * libfile,const char * pylibfile,const char * idxfile)172 PinyinPhraseLib::PinyinPhraseLib (const PinyinCustomSettings &custom,
173 const PinyinValidator *validator,
174 PinyinTable *pinyin_table,
175 const char *libfile,
176 const char *pylibfile,
177 const char *idxfile)
178 : m_pinyin_table (pinyin_table),
179 m_validator (validator),
180 m_pinyin_key_less (custom),
181 m_pinyin_key_equal (custom),
182 m_pinyin_phrase_less_by_offset (this, custom),
183 m_pinyin_phrase_equal_by_offset (this, custom)
184 {
185 if (!m_validator) m_validator = PinyinValidator::get_default_pinyin_validator ();
186
187 load_lib (libfile, pylibfile, idxfile);
188 }
189
PinyinPhraseLib(const PinyinCustomSettings & custom,const PinyinValidator * validator,PinyinTable * pinyin_table,std::istream & is_lib,std::istream & is_pylib,std::istream & is_idx)190 PinyinPhraseLib::PinyinPhraseLib (const PinyinCustomSettings &custom,
191 const PinyinValidator *validator,
192 PinyinTable *pinyin_table,
193 std::istream &is_lib,
194 std::istream &is_pylib,
195 std::istream &is_idx)
196 : m_pinyin_table (pinyin_table),
197 m_validator (validator),
198 m_pinyin_key_less (custom),
199 m_pinyin_key_equal (custom),
200 m_pinyin_phrase_less_by_offset (this, custom),
201 m_pinyin_phrase_equal_by_offset (this, custom)
202 {
203 if (!m_validator) m_validator = PinyinValidator::get_default_pinyin_validator ();
204
205 input (is_lib, is_pylib, is_idx);
206 }
207
208 class __PinyinPhraseOutputIndexFuncBinary {
209 std::ostream &m_os;
210 public:
__PinyinPhraseOutputIndexFuncBinary(std::ostream & os)211 __PinyinPhraseOutputIndexFuncBinary (std::ostream &os) : m_os (os) { }
operator ()(const PinyinPhrase & phrase)212 void operator () (const PinyinPhrase & phrase) {
213 if (phrase.is_enable ()) {
214 unsigned char bytes [8];
215 scim_uint32tobytes (bytes, phrase.get_phrase_offset ());
216 scim_uint32tobytes (bytes+4, phrase.get_pinyin_offset ());
217 m_os.write ((char*) bytes, sizeof (unsigned char) * 8);
218 }
219 }
220 };
221
222 class __PinyinPhraseOutputIndexFuncText {
223 std::ostream &m_os;
224 public:
__PinyinPhraseOutputIndexFuncText(std::ostream & os)225 __PinyinPhraseOutputIndexFuncText (std::ostream &os) : m_os (os) { }
operator ()(const PinyinPhrase & phrase)226 void operator () (const PinyinPhrase & phrase) {
227 if (phrase.is_enable ()) {
228 m_os << phrase.get_phrase_offset () << " ";
229 m_os << phrase.get_pinyin_offset ();
230 m_os << "\n";
231 }
232 }
233 };
234
235 bool
output_indexes(std::ostream & os,bool binary)236 PinyinPhraseLib::output_indexes (std::ostream &os, bool binary)
237 {
238 uint32 phrase_number = count_phrase_number ();
239
240 if (binary) {
241 unsigned char bytes [4];
242 os << scim_pinyin_phrase_idx_lib_binary_header << "\n";
243 os << scim_pinyin_phrase_idx_lib_version << "\n";
244
245 scim_uint32tobytes (bytes, phrase_number);
246 os.write ((char*) bytes, sizeof (unsigned char) * 4);
247
248 __PinyinPhraseOutputIndexFuncBinary func(os);
249
250 for_each_phrase (func);
251 } else {
252 os << scim_pinyin_phrase_idx_lib_text_header << "\n";
253 os << scim_pinyin_phrase_idx_lib_version << "\n";
254 os << phrase_number << "\n";
255
256 __PinyinPhraseOutputIndexFuncText func(os);
257
258 for_each_phrase (func);
259 }
260 return true;
261 }
262
263 bool
input_indexes(std::istream & is)264 PinyinPhraseLib::input_indexes (std::istream &is)
265 {
266 char header [40];
267 bool binary = false;
268
269 if (!is) return false;
270
271 //check index file
272 is.getline (header, 40);
273 if (std::strncmp (header,
274 scim_pinyin_phrase_idx_lib_text_header,
275 std::strlen (scim_pinyin_phrase_idx_lib_text_header)) == 0) {
276 binary = false;
277 } else if (std::strncmp (header,
278 scim_pinyin_phrase_idx_lib_binary_header,
279 std::strlen (scim_pinyin_phrase_idx_lib_binary_header)) == 0) {
280 binary = true;
281 } else {
282 return false;
283 }
284
285 is.getline (header, 40);
286 if (std::strncmp (header, scim_pinyin_phrase_idx_lib_version,
287 std::strlen (scim_pinyin_phrase_idx_lib_version)) != 0)
288 return false;
289
290 unsigned char bytes [8];
291 uint32 number;
292
293 //get index number
294 if (binary) {
295 is.read ((char*) bytes, sizeof(unsigned char) * 4);
296 number = scim_bytestouint32 (bytes);
297 } else {
298 is.getline (header, 40);
299 number = atoi (header);
300 }
301
302 if (number == 0) return false;
303
304 clear_phrase_index ();
305
306 if (binary) {
307 for (uint32 i=0; i<number; i++) {
308 is.read ((char*) bytes, sizeof(unsigned char) * 8);
309
310 insert_pinyin_phrase_into_index (scim_bytestouint32 (bytes),
311 scim_bytestouint32 (bytes+4));
312 }
313 } else {
314 uint32 phrase_offset;
315 uint32 pinyin_offset;
316 for (uint32 i=0; i<number; i++) {
317 is >> phrase_offset;
318 is >> pinyin_offset;
319
320 insert_pinyin_phrase_into_index (phrase_offset, pinyin_offset);
321 }
322 }
323
324 sort_phrase_tables ();
325
326 return true;
327 }
328
329 bool
output(std::ostream & os_lib,std::ostream & os_pylib,std::ostream & os_idx,bool binary)330 PinyinPhraseLib::output (std::ostream &os_lib,
331 std::ostream &os_pylib,
332 std::ostream &os_idx,
333 bool binary)
334 {
335 bool ret = true;
336 if (!(os_lib || os_pylib || os_idx))
337 return false;
338
339 if (os_lib && !m_phrase_lib.output (os_lib, binary))
340 ret = false;
341
342 if (os_pylib && !output_pinyin_lib (os_pylib, binary))
343 ret = false;
344
345 if (os_idx && !output_indexes (os_idx, binary))
346 ret = false;
347
348 return ret;
349 }
350
351 bool
input(std::istream & is_lib,std::istream & is_pylib,std::istream & is_idx)352 PinyinPhraseLib::input (std::istream &is_lib,
353 std::istream &is_pylib,
354 std::istream &is_idx)
355 {
356 if (m_phrase_lib.input (is_lib)) {
357 if (is_idx && input_pinyin_lib (*m_validator, is_pylib)) {
358 if (!input_indexes (is_idx)) {
359 create_pinyin_index ();
360 return true;
361 }
362 } else {
363 create_pinyin_index ();
364 return true;
365 }
366 return true;
367 }
368 return false;
369 }
370
371 bool
input(std::istream & is_lib)372 PinyinPhraseLib::input (std::istream &is_lib)
373 {
374 if (m_phrase_lib.input (is_lib)) {
375 create_pinyin_index ();
376 return true;
377 }
378 return false;
379 }
380
381 bool
load_lib(const char * libfile,const char * pylibfile,const char * idxfile)382 PinyinPhraseLib::load_lib (const char *libfile,
383 const char *pylibfile,
384 const char *idxfile)
385 {
386 std::ifstream is_lib(libfile);
387 std::ifstream is_pylib (pylibfile);
388 std::ifstream is_idx (idxfile);
389 if (!is_lib) return false;
390 input (is_lib, is_pylib, is_idx);
391 compact_memory ();
392 return number_of_phrases () != 0;
393 }
394
395 bool
save_lib(const char * libfile,const char * pylibfile,const char * idxfile,bool binary)396 PinyinPhraseLib::save_lib (const char *libfile,
397 const char *pylibfile,
398 const char *idxfile,
399 bool binary)
400 {
401 std::ofstream os_lib(libfile);
402 std::ofstream os_pylib(pylibfile);
403 std::ofstream os_idx(idxfile);
404 return output (os_lib, os_pylib, os_idx, binary);
405 }
406
407 void
update_custom_settings(const PinyinCustomSettings & custom,const PinyinValidator * validator)408 PinyinPhraseLib::update_custom_settings (const PinyinCustomSettings &custom,
409 const PinyinValidator *validator)
410 {
411 m_pinyin_key_less = PinyinKeyLessThan (custom);
412 m_pinyin_key_equal = PinyinKeyEqualTo (custom);
413 m_pinyin_phrase_less_by_offset = PinyinPhraseLessThanByOffset (this, custom);
414 m_pinyin_phrase_equal_by_offset = PinyinPhraseEqualToByOffset (this, custom);
415
416 m_validator = validator;
417
418 if (!m_validator)
419 m_validator = PinyinValidator::get_default_pinyin_validator ();
420
421 sort_phrase_tables ();
422 }
423
424 int
find_phrases(PhraseVector & vec,const PinyinKeyVector & keys,bool noshorter,bool nolonger)425 PinyinPhraseLib::find_phrases (PhraseVector &vec,
426 const PinyinKeyVector &keys,
427 bool noshorter,
428 bool nolonger)
429 {
430 int minlen, maxlen;
431
432 if (noshorter) minlen = keys.size();
433 else minlen = 1;
434
435 if (nolonger) maxlen = keys.size();
436 else maxlen = -1;
437
438 return find_phrases (vec, keys.begin(), keys.end(), minlen, maxlen);
439 }
440
441 int
find_phrases(PhraseVector & vec,const PinyinParsedKeyVector & keys,bool noshorter,bool nolonger)442 PinyinPhraseLib::find_phrases (PhraseVector &vec,
443 const PinyinParsedKeyVector &keys,
444 bool noshorter,
445 bool nolonger)
446 {
447 int minlen, maxlen;
448
449 if (noshorter) minlen = keys.size();
450 else minlen = 1;
451
452 if (nolonger) maxlen = keys.size();
453 else maxlen = -1;
454
455 PinyinKeyVector nkeys;
456
457 for (PinyinParsedKeyVector::const_iterator i=keys.begin(); i!=keys.end(); i++)
458 nkeys.push_back (*i);
459
460 return find_phrases (vec, nkeys.begin(), nkeys.end(), minlen, maxlen);
461 }
462
463 int
find_phrases(PhraseVector & vec,const PinyinParsedKeyVector::const_iterator & begin,const PinyinParsedKeyVector::const_iterator & end,int minlen,int maxlen)464 PinyinPhraseLib::find_phrases (PhraseVector &vec,
465 const PinyinParsedKeyVector::const_iterator &begin,
466 const PinyinParsedKeyVector::const_iterator &end,
467 int minlen,
468 int maxlen)
469 {
470 PinyinKeyVector nkeys;
471
472 for (PinyinParsedKeyVector::const_iterator i=begin; i!=end; i++)
473 nkeys.push_back (*i);
474
475 return find_phrases (vec, nkeys.begin(), nkeys.end(), minlen, maxlen);
476 }
477
478 int
find_phrases(PhraseVector & vec,const char * keys,bool noshorter,bool nolonger)479 PinyinPhraseLib::find_phrases (PhraseVector &vec,
480 const char *keys,
481 bool noshorter,
482 bool nolonger)
483 {
484 PinyinParsedKeyVector pykeys;
485 PinyinDefaultParser parser;
486
487 parser.parse (*m_validator, pykeys, keys);
488
489 return find_phrases (vec, pykeys, noshorter, nolonger);
490 }
491
492 int
find_phrases(PhraseVector & vec,const PinyinKeyVector::const_iterator & begin,const PinyinKeyVector::const_iterator & end,int minlen,int maxlen)493 PinyinPhraseLib::find_phrases (PhraseVector &vec,
494 const PinyinKeyVector::const_iterator &begin,
495 const PinyinKeyVector::const_iterator &end,
496 int minlen,
497 int maxlen)
498 {
499 if (begin >= end) return 0;
500
501 minlen -= 1;
502 if (minlen < 0) minlen = 0;
503
504 if (maxlen <= 0) maxlen = SCIM_PHRASE_MAX_LENGTH;
505 else maxlen = std::min (maxlen, SCIM_PHRASE_MAX_LENGTH);
506
507 if (minlen >= maxlen) return 0;
508
509 std::pair<PinyinPhraseTable::iterator, PinyinPhraseTable::iterator> ptit;
510
511 for (int i=minlen; i<maxlen; i++) {
512 ptit = std::equal_range (m_phrases [i].begin (),
513 m_phrases [i].end (),
514 (*begin),
515 m_pinyin_key_less);
516
517 PinyinKeyVector::const_iterator pos = begin + (std::min ((int)(end-begin-1), i));
518
519 for (PinyinPhraseTable::iterator tit=ptit.first; tit!=ptit.second; tit++) {
520 find_phrases_impl (vec,
521 tit->get_vector ().begin(),
522 tit->get_vector ().end(),
523 begin,
524 pos,
525 end);
526 }
527 }
528
529 std::sort (vec.begin(), vec.end(), PhraseExactLessThan ());
530 vec.erase (std::unique (vec.begin(), vec.end(), PhraseExactEqualTo ()), vec.end());
531
532 return vec.size ();
533 }
534
535 void
find_phrases_impl(PhraseVector & pv,const PinyinPhraseOffsetVector::iterator & begin,const PinyinPhraseOffsetVector::iterator & end,const PinyinKeyVector::const_iterator & key_begin,const PinyinKeyVector::const_iterator & key_pos,const PinyinKeyVector::const_iterator & key_end)536 PinyinPhraseLib::find_phrases_impl (PhraseVector &pv,
537 const PinyinPhraseOffsetVector::iterator &begin,
538 const PinyinPhraseOffsetVector::iterator &end,
539 const PinyinKeyVector::const_iterator &key_begin,
540 const PinyinKeyVector::const_iterator &key_pos,
541 const PinyinKeyVector::const_iterator &key_end)
542 {
543 if (begin == end) return;
544
545 if (key_pos == key_begin) {
546 for (PinyinPhraseOffsetVector::iterator i=begin; i!=end; i++) {
547 if (valid_pinyin_phrase (i->first, i->second) &&
548 get_phrase (i->first).is_enable ())
549 pv.push_back (get_phrase (i->first));
550 }
551 return;
552 }
553
554 std::sort (begin, end, PinyinPhraseLessThanByOffsetSP (this, m_pinyin_key_less, key_pos-key_begin));
555
556 std::pair<PinyinPhraseOffsetVector::iterator, PinyinPhraseOffsetVector::iterator> it =
557 std::equal_range (begin, end, *key_pos,
558 PinyinPhraseLessThanByOffsetSP (this, m_pinyin_key_less, key_pos-key_begin));
559
560 return find_phrases_impl (pv, it.first, it.second, key_begin, key_pos-1, key_end);
561 }
562
563 Phrase
append(const Phrase & phrase,const PinyinKeyVector & keys)564 PinyinPhraseLib::append (const Phrase &phrase, const PinyinKeyVector &keys)
565 {
566 if (!phrase.valid () || !valid ())
567 return Phrase ();
568
569 Phrase tmp = m_phrase_lib.find (phrase);
570
571 if (tmp.valid () && tmp.is_enable ())
572 return tmp;
573
574 tmp = m_phrase_lib.append (phrase);
575
576 if (!tmp.valid () || !tmp.is_enable ())
577 return Phrase ();
578
579 insert_phrase_into_index (tmp, keys);
580 return tmp;
581 }
582
583 Phrase
append(const WideString & phrase,const PinyinKeyVector & keys)584 PinyinPhraseLib::append (const WideString &phrase, const PinyinKeyVector &keys)
585 {
586 if (phrase.length () == 0 || !valid ())
587 return Phrase ();
588
589 Phrase tmp = m_phrase_lib.find (phrase);
590
591 if (tmp.valid () && tmp.is_enable ())
592 return tmp;
593
594 tmp = m_phrase_lib.append (phrase);
595
596 if (!tmp.valid ())
597 return Phrase ();
598
599 insert_phrase_into_index (tmp, keys);
600 return tmp;
601 }
602
603 bool
insert_phrase_into_index(const Phrase & phrase,const PinyinKeyVector & keys)604 PinyinPhraseLib::insert_phrase_into_index (const Phrase &phrase, const PinyinKeyVector &keys)
605 {
606 if (!phrase.valid ()) return false;
607
608 // First find out all of the chars which have no valid key in keys.
609 WideString content = phrase.get_content ();
610 WideString nokey_content;
611
612 PinyinKeyVector final_keys;
613
614 std::vector<uint32> content_state;
615
616 std::vector<PinyinKeyVector> key_vv;
617
618 uint32 pinyin_offset = m_pinyin_lib.size ();
619
620 uint32 i,j,k;
621
622 for (i=0; i<content.length (); ++i) {
623 if (i < keys.size () &&
624 keys [i].get_initial () != SCIM_PINYIN_ZeroInitial &&
625 keys [i].get_final () != SCIM_PINYIN_ZeroFinal) {
626 //This key is valid, store it into final_key.
627 final_keys.push_back (keys [i]);
628 content_state.push_back (1);
629 } else {
630 //This key is invalid, put the content into nokey_content,
631 //and store a zero key into final_keys,
632 //and store the position into invalid_key_pos.
633 nokey_content.push_back (content [i]);
634 final_keys.push_back (PinyinKey ());
635 content_state.push_back (0);
636 }
637 }
638
639 if (nokey_content.length ())
640 m_pinyin_table->find_key_strings (key_vv, nokey_content);
641 else
642 key_vv.push_back (PinyinKeyVector ());
643
644 std::sort (m_phrases [content.length () -1].begin (),
645 m_phrases [content.length () -1].end (),
646 PinyinKeyExactLessThan ());
647
648 if (m_pinyin_lib.capacity () < m_pinyin_lib.size () + key_vv.size () * content.length ())
649 m_pinyin_lib.reserve (m_pinyin_lib.size () + key_vv.size () * content.length () + 1);
650
651 for (i=0; i<key_vv.size(); ++i) {
652 for (j=0, k=0; j<content.length (); ++j) {
653 if (content_state [j])
654 m_pinyin_lib.push_back (final_keys [j]);
655 else
656 m_pinyin_lib.push_back (key_vv [i][k++]);
657 }
658
659 insert_pinyin_phrase_into_index (phrase.get_phrase_offset (),
660 pinyin_offset);
661
662 pinyin_offset = m_pinyin_lib.size ();
663 }
664
665 std::sort (m_phrases [content.length () -1].begin (),
666 m_phrases [content.length () -1].end (), m_pinyin_key_less);
667
668 return true;
669 }
670
671 bool
insert_pinyin_phrase_into_index(uint32 phrase_index,uint32 pinyin_index)672 PinyinPhraseLib::insert_pinyin_phrase_into_index (uint32 phrase_index, uint32 pinyin_index)
673 {
674 if (!valid_pinyin_phrase (phrase_index, pinyin_index))
675 return false;
676
677 uint32 len = get_phrase (phrase_index).length();
678
679 if (len <= 0) return false;
680
681 PinyinKey key = get_pinyin_key (pinyin_index);
682
683 PinyinPhraseTable::iterator ptit=
684 std::lower_bound (m_phrases[len-1].begin (), m_phrases[len-1].end (), key, PinyinKeyExactLessThan ());
685
686 if (ptit != m_phrases[len-1].end () && PinyinKeyExactEqualTo () (*ptit,key)) {
687 ptit->get_vector ().push_back (PinyinPhraseOffsetPair (phrase_index, pinyin_index));
688 } else {
689 PinyinPhraseEntry entry (key);
690 entry.get_vector ().push_back (PinyinPhraseOffsetPair (phrase_index, pinyin_index));
691
692 if (ptit != m_phrases [len-1].end () &&
693 ptit >= m_phrases [len-1].begin () &&
694 m_phrases[len-1].size () > 0) {
695 m_phrases[len-1].insert (ptit, entry);
696 } else {
697 m_phrases[len-1].push_back (entry);
698 }
699 }
700 return true;
701 }
702
703 class __PinyinPhraseCountNumber
704 {
705 uint32 m_number;
706 public:
__PinyinPhraseCountNumber()707 __PinyinPhraseCountNumber () : m_number (0) { }
get_number()708 uint32 get_number () { return m_number; }
operator ()(const PinyinPhrase & phrase)709 void operator () (const PinyinPhrase &phrase) {
710 if (phrase.is_enable ())
711 m_number ++;
712 }
713 };
714
715 uint32
count_phrase_number()716 PinyinPhraseLib::count_phrase_number ()
717 {
718 __PinyinPhraseCountNumber counter;
719
720 for_each_phrase (counter);
721
722 return counter.get_number();
723 }
724
725 void
create_pinyin_index()726 PinyinPhraseLib::create_pinyin_index ()
727 {
728 if (!m_pinyin_table || !m_pinyin_table->size()) return;
729
730 clear_phrase_index ();
731
732 uint32 pinyin_offset = 0;
733
734 WideString content;
735 Phrase phrase;
736
737 for (uint32 i=0; i<m_phrase_lib.number_of_phrases (); i++) {
738 phrase = m_phrase_lib.get_phrase_by_index (i);
739
740 content = phrase.get_content ();
741
742 std::vector<PinyinKeyVector> key_vv;
743 m_pinyin_table->find_key_strings (key_vv, content);
744
745 for (uint32 j=0; j<key_vv.size(); j++) {
746 for (uint32 k=0; k<key_vv[j].size(); k++)
747 m_pinyin_lib.push_back (key_vv[j][k]);
748
749 insert_pinyin_phrase_into_index (phrase.get_phrase_offset (), pinyin_offset);
750
751 pinyin_offset = m_pinyin_lib.size ();
752 }
753 #if 0
754 if (key_vv.size () > 1 && content.length () > 1) {
755 for (uint32 x=0; x<key_vv.size (); x++) {
756 std::cerr << phrase.frequency () << "\t| " <<
757 utf8_wcstombs (content) << " =";
758 for (uint32 y=0; y<key_vv[x].size (); y++)
759 std::cerr << " " << key_vv[x][y];
760 std::cerr << "\n";
761 }
762 }
763 #endif
764 std::cout << "." << std::flush;
765 }
766
767 sort_phrase_tables ();
768
769 std::cout << "Phrase Number = " << count_phrase_number () << "\n";
770 }
771
772 void
sort_phrase_tables()773 PinyinPhraseLib::sort_phrase_tables ()
774 {
775 for (uint32 i=0; i<SCIM_PHRASE_MAX_LENGTH; i++) {
776 if (m_phrases [i].size ())
777 std::sort (m_phrases[i].begin (), m_phrases[i].end (), m_pinyin_key_less);
778 }
779 }
780
781 void
refine_phrase_index(PinyinPhraseValidatorFunc pinyin_phrase_validator)782 PinyinPhraseLib::refine_phrase_index (PinyinPhraseValidatorFunc pinyin_phrase_validator)
783 {
784 for (uint32 i=0; i<SCIM_PHRASE_MAX_LENGTH; i++) {
785 for (PinyinPhraseTable::iterator tit=m_phrases[i].begin(); tit!=m_phrases[i].end(); tit++) {
786 std::sort (tit->get_vector ().begin (),
787 tit->get_vector ().end (),
788 m_pinyin_phrase_less_by_offset);
789 tit->get_vector ().erase (
790 std::unique (tit->get_vector ().begin (),
791 tit->get_vector ().end (),
792 m_pinyin_phrase_equal_by_offset),
793 tit->get_vector ().end ());
794 if (pinyin_phrase_validator) {
795 PinyinPhraseOffsetVector tmp;
796 tmp.reserve (tit->get_vector ().size ());
797 for (PinyinPhraseOffsetVector::iterator vit=tit->get_vector ().begin ();
798 vit!=tit->get_vector ().end ();
799 vit++) {
800 if (pinyin_phrase_validator (PinyinPhrase (this, vit->first, vit->second)))
801 tmp.push_back (*vit);
802 }
803 tit->get_vector () = tmp;
804 }
805 }
806 }
807 }
808
809 void
refine_pinyin_lib()810 PinyinPhraseLib::refine_pinyin_lib ()
811 {
812 PinyinKeyVector tmp_pinyin_lib;
813
814 PinyinKeyVector::const_iterator result;
815 PinyinKeyVector::const_iterator vit_begin;
816 PinyinKeyVector::const_iterator vit_end;
817
818 uint32 len;
819 uint32 pinyin_offset;
820
821 tmp_pinyin_lib.reserve (m_pinyin_lib.size () + 1);
822
823 for (int i=SCIM_PHRASE_MAX_LENGTH-1; i>=0; i--) {
824 for (PinyinPhraseTable::iterator tit=m_phrases[i].begin(); tit!=m_phrases[i].end(); tit++) {
825 for (PinyinPhraseOffsetVector::iterator vit=tit->get_vector ().begin();
826 vit!=tit->get_vector ().end(); vit++) {
827 len = get_phrase (vit->first).length ();
828
829 if (len > 0) {
830 vit_begin = m_pinyin_lib.begin () + vit->second;
831 vit_end = vit_begin + len;
832
833 for (result = tmp_pinyin_lib.begin ();
834 result != tmp_pinyin_lib.end ();
835 result ++) {
836 uint32 j;
837 for (j=0; j< len && result + j < tmp_pinyin_lib.end (); j++) {
838 if (!m_pinyin_key_equal (*(result+j), *(vit_begin + j)))
839 break;
840 }
841 if (j == len)
842 break;
843 }
844
845 /*
846 result = std::find_end (tmp_pinyin_lib.begin (),
847 tmp_pinyin_lib.end (),
848 vit_begin,
849 vit_end,
850 m_pinyin_key_equal);
851 */
852
853 if (result != tmp_pinyin_lib.end ())
854 pinyin_offset = result - tmp_pinyin_lib.begin ();
855 else {
856 pinyin_offset = tmp_pinyin_lib.size ();
857 for (uint32 j=0; j<len; j++)
858 tmp_pinyin_lib.push_back (get_pinyin_key (vit->second + j));
859 }
860 vit->second = pinyin_offset;
861 }
862 std::cout << "." << std::flush;
863 }
864 }
865 }
866
867 std::cout << "\n";
868
869 m_pinyin_lib = tmp_pinyin_lib;
870 }
871
872 void
refine_library(PinyinPhraseValidatorFunc pinyin_phrase_validator)873 PinyinPhraseLib::refine_library (PinyinPhraseValidatorFunc pinyin_phrase_validator)
874 {
875 std::cout << "\n" << "refining phrase index." << "\n";
876 refine_phrase_index (pinyin_phrase_validator);
877 std::cout << "\n" << "refining pinyin lib." << "\n";
878 refine_pinyin_lib ();
879 }
880
881 void
clear_phrase_index()882 PinyinPhraseLib::clear_phrase_index ()
883 {
884 for (int i=0; i<SCIM_PHRASE_MAX_LENGTH; i++)
885 m_phrases [i].clear ();
886 }
887
888 void
compact_memory()889 PinyinPhraseLib::compact_memory ()
890 {
891 PinyinKeyVector (m_pinyin_lib).swap (m_pinyin_lib);
892
893 for (uint32 i=0; i<SCIM_PHRASE_MAX_LENGTH; i++) {
894 for (uint32 j=0; j<m_phrases [i].size (); j++)
895 (m_phrases [i])[j].compact_memory ();
896 }
897 }
898
899 void
dump_content(std::ostream & os,int minlen,int maxlen)900 PinyinPhraseLib::dump_content (std::ostream &os, int minlen, int maxlen)
901 {
902 PinyinPhraseLessThanByOffset less_op (this, m_pinyin_key_less);
903 if (minlen < 1) minlen = 1;
904 if (maxlen > SCIM_PHRASE_MAX_LENGTH) maxlen = SCIM_PHRASE_MAX_LENGTH;
905
906 for (int i = minlen; i <= maxlen; ++ i) {
907 PinyinPhraseOffsetVector offsets;
908 for (PinyinPhraseTable::iterator tit = m_phrases [i-1].begin (); tit != m_phrases [i-1].end (); ++ tit) {
909 PinyinPhraseOffsetVector::iterator begin = tit->get_vector ().begin ();
910 PinyinPhraseOffsetVector::iterator end = tit->get_vector ().end ();
911 offsets.insert (offsets.end (), begin, end);
912 }
913
914 std::sort (offsets.begin (), offsets.end (), less_op);
915
916 for (PinyinPhraseOffsetVector::iterator oit = offsets.begin (); oit != offsets.end (); ++ oit) {
917 bool before = false, after = false;
918
919 os << get_phrase (oit->first).frequency () << "\t";
920 if (oit > offsets.begin () && get_phrase ((oit-1)->first) == get_phrase (oit->first)) before = true;
921 if (oit < offsets.end () - 1 && get_phrase ((oit+1)->first) == get_phrase (oit->first)) after = true;
922 if (before || after) os << "+";
923 else os << "-";
924 os << utf8_wcstombs (get_phrase (oit->first).get_content ());
925 os << " =";
926 for (unsigned int j = 0; j < get_phrase (oit->first).length (); ++ j)
927 os << " " << get_pinyin_key (oit->second + j);
928 os << "\n";
929 }
930 }
931 }
932
933 void
optimize_phrase_frequencies(uint32 max_freq)934 PinyinPhraseLib::optimize_phrase_frequencies (uint32 max_freq)
935 {
936 uint32 freq = m_phrase_lib.get_max_phrase_frequency ();
937
938 if (freq < max_freq || !max_freq) return;
939
940 double ratio = ((double) max_freq) / freq;
941
942 Phrase phrase;
943
944 for (int i = 0; i<(int)m_phrase_lib.number_of_phrases (); ++i) {
945 phrase = m_phrase_lib.get_phrase_by_index (i);
946 phrase.set_frequency ((uint32)(phrase.frequency () * ratio));
947 }
948 }
949
950 /*
951 vi:ts=4:nowrap:ai
952 */
953