1 /** @file scim_pinyin_phrase.cpp
2  * implementation of PinyinPhrase, PinyinPhraseLib and related classes.
3  */
4 
5 /*
6  * Smart Pinyin Input Method
7  *
8  * Copyright (c) 2005 James Su <suzhe@tsinghua.org.cn>
9  *
10  * $Id: scim_pinyin_phrase.cpp,v 1.3 2006/01/13 06:31:46 suzhe Exp $
11  *
12  */
13 #define Uses_STL_FUNCTIONAL
14 #define Uses_STL_VECTOR
15 #define Uses_STL_IOSTREAM
16 #define Uses_STL_FSTREAM
17 #define Uses_STL_ALGORITHM
18 #define Uses_STL_MAP
19 #define Uses_STL_UTILITY
20 #define Uses_STL_IOMANIP
21 #define Uses_C_STDIO
22 #define Uses_SCIM_UTILITY
23 #define Uses_SCIM_SERVER
24 #define Uses_SCIM_ICONV
25 #define Uses_SCIM_CONFIG_BASE
26 #define Uses_SCIM_CONFIG_PATH
27 #define Uses_SCIM_LOOKUP_TABLE
28 
29 #include <cstring>
30 
31 #include <scim.h>
32 #include "scim_pinyin_private.h"
33 #include "scim_phrase.h"
34 #include "scim_pinyin.h"
35 #include "scim_pinyin_phrase.h"
36 
37 static const char scim_pinyin_lib_text_header [] = "SCIM_Pinyin_Library_TEXT";
38 static const char scim_pinyin_lib_binary_header [] = "SCIM_Pinyin_Library_BINARY";
39 static const char scim_pinyin_lib_version [] = "VERSION_0_1";
40 
41 static const char scim_pinyin_phrase_idx_lib_text_header [] = "SCIM_Pinyin_Phrase_Index_Library_TEXT";
42 static const char scim_pinyin_phrase_idx_lib_binary_header [] = "SCIM_Pinyin_Phrase_Index_Library_BINARY";
43 static const char scim_pinyin_phrase_idx_lib_version [] = "VERSION_0_1";
44 
45 bool
operator ()(const PinyinPhrase & lhs,const PinyinPhrase & rhs) const46 PinyinPhraseLessThan::operator () (const PinyinPhrase &lhs,
47 								   const PinyinPhrase &rhs) const
48 {
49 	if (lhs.get_phrase () < rhs.get_phrase ()) return true;
50 	else if (lhs.get_phrase () == rhs.get_phrase ()) {
51 		for (unsigned int i=0; i<lhs.length(); i++) {
52 			if (m_less (lhs.get_key (i), rhs.get_key (i))) return true;
53 			else if (m_less (rhs.get_key (i), lhs.get_key (i))) return false;
54 		}
55 	}
56 	return false;
57 }
58 
59 bool
operator ()(const PinyinPhrase & lhs,const PinyinPhrase & rhs) const60 PinyinPhraseEqualTo::operator () (const PinyinPhrase &lhs,
61 								  const PinyinPhrase &rhs) const
62 {
63 	if (lhs.get_lib () == rhs.get_lib ()&&
64 		 lhs.get_pinyin_offset () == rhs.get_pinyin_offset () &&
65 		 lhs.get_phrase_offset () == rhs.get_phrase_offset ())
66 		return true;
67 	else if (!(lhs.get_phrase () == rhs.get_phrase ())) return false;
68 	else {
69 		for (unsigned int i=0; i<lhs.length(); i++)
70 			if (!m_equal (lhs.get_key (i), rhs.get_key (i))) return false;
71 	}
72 	return true;
73 }
74 
75 bool
output_pinyin_lib(std::ostream & os,bool binary)76 PinyinPhraseLib::output_pinyin_lib (std::ostream &os, bool binary)
77 {
78 	if (m_pinyin_lib.size () == 0) return false;
79 
80 	if (binary) {
81 		unsigned char bytes [4];
82 
83 		os << scim_pinyin_lib_binary_header << "\n";
84 		os << scim_pinyin_lib_version << "\n";
85 
86 		scim_uint32tobytes (bytes, m_pinyin_lib.size ());
87 		os.write ((char*) bytes, sizeof (unsigned char) * 4);
88 
89 		for (PinyinKeyVector::iterator i=m_pinyin_lib.begin (); i!=m_pinyin_lib.end (); i++)
90 			i->output_binary (os);
91 
92 	} else {
93 		uint32 count = 0;
94 		os << scim_pinyin_lib_text_header << "\n";
95 		os << scim_pinyin_lib_version << "\n";
96 		os << m_pinyin_lib.size () << "\n";
97 
98 		for (PinyinKeyVector::iterator i=m_pinyin_lib.begin (); i!=m_pinyin_lib.end (); i++) {
99 			i->output_text (os);
100 			os << " ";
101 			count ++;
102 			if (count == 32) {
103 				os << "\n";
104 				count = 0;
105 			}
106 		}
107 	}
108 	return true;
109 }
110 
111 bool
input_pinyin_lib(const PinyinValidator & validator,std::istream & is)112 PinyinPhraseLib::input_pinyin_lib (const PinyinValidator &validator, std::istream &is)
113 {
114 	if (!is) return false;
115 
116 	m_pinyin_lib.clear ();
117 
118 	char header [40];
119 	bool binary;
120 
121 	//check header
122 	is.getline (header, 40);
123 	if (std::strncmp (header,
124 		scim_pinyin_lib_text_header,
125 		std::strlen (scim_pinyin_lib_text_header)) == 0) {
126 		binary = false;
127 	} else if (std::strncmp (header,
128 		scim_pinyin_lib_binary_header,
129 		std::strlen (scim_pinyin_lib_binary_header)) == 0) {
130 		binary = true;
131 	} else {
132 		return false;
133 	}
134 
135 	is.getline (header, 40);
136 	if (std::strncmp (header, scim_pinyin_lib_version, std::strlen (scim_pinyin_lib_version)) != 0)
137 		return false;
138 
139 	unsigned char bytes [4];
140 	PinyinKey key;
141 	uint32 number;
142 
143 	//get length
144 	if (binary) {
145 		is.read ((char*) bytes, sizeof(unsigned char) * 4);
146 		number = scim_bytestouint32 (bytes);
147 	} else {
148 		is.getline (header, 40);
149 		number = atoi (header);
150 	}
151 
152 	if (number <= 0) return false;
153 
154 	m_pinyin_lib.reserve (number + 256);
155 
156 	if (binary) {
157 		for (uint32 i=0; i<number; i++) {
158 			key.input_binary (validator, is);
159 			m_pinyin_lib.push_back (key);
160 		}
161 	} else {
162 		for (uint32 i=0; i<number; i++) {
163 			key.input_text (validator, is);
164 			m_pinyin_lib.push_back (key);
165 		}
166 	}
167 
168 	return true;
169 }
170 
171 //Pinyin Phrase Library
PinyinPhraseLib(const PinyinCustomSettings & custom,const PinyinValidator * validator,PinyinTable * pinyin_table,const char * libfile,const char * pylibfile,const char * idxfile)172 PinyinPhraseLib::PinyinPhraseLib (const PinyinCustomSettings &custom,
173 								  const PinyinValidator *validator,
174 				 				  PinyinTable *pinyin_table,
175 								  const char *libfile,
176 								  const char *pylibfile,
177 								  const char *idxfile)
178 	: m_pinyin_table (pinyin_table),
179 	  m_validator (validator),
180 	  m_pinyin_key_less (custom),
181 	  m_pinyin_key_equal (custom),
182 	  m_pinyin_phrase_less_by_offset (this, custom),
183 	  m_pinyin_phrase_equal_by_offset (this, custom)
184 {
185 	if (!m_validator) m_validator = PinyinValidator::get_default_pinyin_validator ();
186 
187 	load_lib (libfile, pylibfile, idxfile);
188 }
189 
PinyinPhraseLib(const PinyinCustomSettings & custom,const PinyinValidator * validator,PinyinTable * pinyin_table,std::istream & is_lib,std::istream & is_pylib,std::istream & is_idx)190 PinyinPhraseLib::PinyinPhraseLib (const PinyinCustomSettings &custom,
191 								  const PinyinValidator *validator,
192 				 				  PinyinTable *pinyin_table,
193 								  std::istream &is_lib,
194 								  std::istream &is_pylib,
195 								  std::istream &is_idx)
196 	: m_pinyin_table (pinyin_table),
197 	  m_validator (validator),
198 	  m_pinyin_key_less (custom),
199 	  m_pinyin_key_equal (custom),
200 	  m_pinyin_phrase_less_by_offset (this, custom),
201 	  m_pinyin_phrase_equal_by_offset (this, custom)
202 {
203 	if (!m_validator) m_validator = PinyinValidator::get_default_pinyin_validator ();
204 
205 	input (is_lib, is_pylib, is_idx);
206 }
207 
208 class __PinyinPhraseOutputIndexFuncBinary {
209 	std::ostream &m_os;
210 public:
__PinyinPhraseOutputIndexFuncBinary(std::ostream & os)211 	__PinyinPhraseOutputIndexFuncBinary (std::ostream &os) : m_os (os) { }
operator ()(const PinyinPhrase & phrase)212 	void operator () (const PinyinPhrase & phrase) {
213 		if (phrase.is_enable ()) {
214 			unsigned char bytes [8];
215 			scim_uint32tobytes (bytes, phrase.get_phrase_offset ());
216 			scim_uint32tobytes (bytes+4, phrase.get_pinyin_offset ());
217 			m_os.write ((char*) bytes, sizeof (unsigned char) * 8);
218 		}
219 	}
220 };
221 
222 class __PinyinPhraseOutputIndexFuncText {
223 	std::ostream &m_os;
224 public:
__PinyinPhraseOutputIndexFuncText(std::ostream & os)225 	__PinyinPhraseOutputIndexFuncText (std::ostream &os) : m_os (os) { }
operator ()(const PinyinPhrase & phrase)226 	void operator () (const PinyinPhrase & phrase) {
227 		if (phrase.is_enable ()) {
228 			m_os << phrase.get_phrase_offset () << " ";
229 			m_os << phrase.get_pinyin_offset ();
230 			m_os << "\n";
231 		}
232 	}
233 };
234 
235 bool
output_indexes(std::ostream & os,bool binary)236 PinyinPhraseLib::output_indexes (std::ostream &os, bool binary)
237 {
238 	uint32 phrase_number = count_phrase_number ();
239 
240 	if (binary) {
241 		unsigned char bytes [4];
242 		os << scim_pinyin_phrase_idx_lib_binary_header << "\n";
243 		os << scim_pinyin_phrase_idx_lib_version << "\n";
244 
245 		scim_uint32tobytes (bytes, phrase_number);
246 		os.write ((char*) bytes, sizeof (unsigned char) * 4);
247 
248 		__PinyinPhraseOutputIndexFuncBinary func(os);
249 
250 		for_each_phrase (func);
251 	} else {
252 		os << scim_pinyin_phrase_idx_lib_text_header << "\n";
253 		os << scim_pinyin_phrase_idx_lib_version << "\n";
254 		os << phrase_number << "\n";
255 
256 		__PinyinPhraseOutputIndexFuncText func(os);
257 
258 		for_each_phrase (func);
259 	}
260 	return true;
261 }
262 
263 bool
input_indexes(std::istream & is)264 PinyinPhraseLib::input_indexes (std::istream &is)
265 {
266 	char header [40];
267 	bool binary = false;
268 
269 	if (!is) return false;
270 
271 	//check index file
272 	is.getline (header, 40);
273 	if (std::strncmp (header,
274 		scim_pinyin_phrase_idx_lib_text_header,
275 		std::strlen (scim_pinyin_phrase_idx_lib_text_header)) == 0) {
276 		binary = false;
277 	} else if (std::strncmp (header,
278 		scim_pinyin_phrase_idx_lib_binary_header,
279 		std::strlen (scim_pinyin_phrase_idx_lib_binary_header)) == 0) {
280 		binary = true;
281 	} else {
282 		return false;
283 	}
284 
285 	is.getline (header, 40);
286 	if (std::strncmp (header, scim_pinyin_phrase_idx_lib_version,
287 					std::strlen (scim_pinyin_phrase_idx_lib_version)) != 0)
288 		return false;
289 
290 	unsigned char bytes [8];
291 	uint32 number;
292 
293 	//get index number
294 	if (binary) {
295 		is.read ((char*) bytes, sizeof(unsigned char) * 4);
296 		number = scim_bytestouint32 (bytes);
297 	} else {
298 		is.getline (header, 40);
299 		number = atoi (header);
300 	}
301 
302 	if (number == 0) return false;
303 
304 	clear_phrase_index ();
305 
306 	if (binary) {
307 		for (uint32 i=0; i<number; i++) {
308 			is.read ((char*) bytes, sizeof(unsigned char) * 8);
309 
310 			insert_pinyin_phrase_into_index (scim_bytestouint32 (bytes),
311 											  scim_bytestouint32 (bytes+4));
312 		}
313 	} else {
314 		uint32 phrase_offset;
315 		uint32 pinyin_offset;
316 		for (uint32 i=0; i<number; i++) {
317 			is >> phrase_offset;
318 			is >> pinyin_offset;
319 
320 			insert_pinyin_phrase_into_index (phrase_offset, pinyin_offset);
321 		}
322 	}
323 
324 	sort_phrase_tables ();
325 
326 	return true;
327 }
328 
329 bool
output(std::ostream & os_lib,std::ostream & os_pylib,std::ostream & os_idx,bool binary)330 PinyinPhraseLib::output (std::ostream &os_lib,
331 						 std::ostream &os_pylib,
332 						 std::ostream &os_idx,
333 						 bool binary)
334 {
335 	bool ret = true;
336 	if (!(os_lib || os_pylib || os_idx))
337 		return false;
338 
339 	if (os_lib && !m_phrase_lib.output (os_lib, binary))
340 		ret = false;
341 
342 	if (os_pylib && !output_pinyin_lib (os_pylib, binary))
343 		ret = false;
344 
345 	if (os_idx && !output_indexes (os_idx, binary))
346 		ret = false;
347 
348 	return ret;
349 }
350 
351 bool
input(std::istream & is_lib,std::istream & is_pylib,std::istream & is_idx)352 PinyinPhraseLib::input (std::istream &is_lib,
353 						std::istream &is_pylib,
354 						std::istream &is_idx)
355 {
356 	if (m_phrase_lib.input (is_lib)) {
357 		if (is_idx && input_pinyin_lib (*m_validator, is_pylib)) {
358 			if (!input_indexes (is_idx)) {
359 				create_pinyin_index ();
360 				return true;
361 			}
362 		} else {
363 			create_pinyin_index ();
364 			return true;
365 		}
366 		return true;
367 	}
368 	return false;
369 }
370 
371 bool
input(std::istream & is_lib)372 PinyinPhraseLib::input (std::istream &is_lib)
373 {
374 	if (m_phrase_lib.input (is_lib)) {
375 		create_pinyin_index ();
376 		return true;
377 	}
378 	return false;
379 }
380 
381 bool
load_lib(const char * libfile,const char * pylibfile,const char * idxfile)382 PinyinPhraseLib::load_lib (const char *libfile,
383 						   const char *pylibfile,
384 						   const char *idxfile)
385 {
386 	std::ifstream is_lib(libfile);
387 	std::ifstream is_pylib (pylibfile);
388 	std::ifstream is_idx (idxfile);
389 	if (!is_lib) return false;
390 	input (is_lib, is_pylib, is_idx);
391 	compact_memory ();
392 	return number_of_phrases () != 0;
393 }
394 
395 bool
save_lib(const char * libfile,const char * pylibfile,const char * idxfile,bool binary)396 PinyinPhraseLib::save_lib (const char *libfile,
397 						   const char *pylibfile,
398 						   const char *idxfile,
399 						   bool binary)
400 {
401 	std::ofstream os_lib(libfile);
402 	std::ofstream os_pylib(pylibfile);
403 	std::ofstream os_idx(idxfile);
404 	return output (os_lib, os_pylib, os_idx, binary);
405 }
406 
407 void
update_custom_settings(const PinyinCustomSettings & custom,const PinyinValidator * validator)408 PinyinPhraseLib::update_custom_settings (const PinyinCustomSettings &custom,
409 										 const PinyinValidator *validator)
410 {
411 	m_pinyin_key_less  = PinyinKeyLessThan (custom);
412 	m_pinyin_key_equal = PinyinKeyEqualTo (custom);
413 	m_pinyin_phrase_less_by_offset  = PinyinPhraseLessThanByOffset (this, custom);
414 	m_pinyin_phrase_equal_by_offset = PinyinPhraseEqualToByOffset (this, custom);
415 
416 	m_validator = validator;
417 
418 	if (!m_validator)
419 		m_validator = PinyinValidator::get_default_pinyin_validator ();
420 
421 	sort_phrase_tables ();
422 }
423 
424 int
find_phrases(PhraseVector & vec,const PinyinKeyVector & keys,bool noshorter,bool nolonger)425 PinyinPhraseLib::find_phrases (PhraseVector &vec,
426 							   const PinyinKeyVector &keys,
427 							   bool noshorter,
428 							   bool nolonger)
429 {
430 	int minlen, maxlen;
431 
432 	if (noshorter) minlen = keys.size();
433 	else minlen = 1;
434 
435 	if (nolonger) maxlen = keys.size();
436 	else maxlen = -1;
437 
438 	return find_phrases (vec, keys.begin(), keys.end(), minlen, maxlen);
439 }
440 
441 int
find_phrases(PhraseVector & vec,const PinyinParsedKeyVector & keys,bool noshorter,bool nolonger)442 PinyinPhraseLib::find_phrases (PhraseVector &vec,
443 							   const PinyinParsedKeyVector &keys,
444 							   bool noshorter,
445 							   bool nolonger)
446 {
447 	int minlen, maxlen;
448 
449 	if (noshorter) minlen = keys.size();
450 	else minlen = 1;
451 
452 	if (nolonger) maxlen = keys.size();
453 	else maxlen = -1;
454 
455 	PinyinKeyVector nkeys;
456 
457 	for (PinyinParsedKeyVector::const_iterator i=keys.begin(); i!=keys.end(); i++)
458 		nkeys.push_back (*i);
459 
460 	return find_phrases (vec, nkeys.begin(), nkeys.end(), minlen, maxlen);
461 }
462 
463 int
find_phrases(PhraseVector & vec,const PinyinParsedKeyVector::const_iterator & begin,const PinyinParsedKeyVector::const_iterator & end,int minlen,int maxlen)464 PinyinPhraseLib::find_phrases (PhraseVector &vec,
465 							   const PinyinParsedKeyVector::const_iterator &begin,
466 							   const PinyinParsedKeyVector::const_iterator &end,
467 							   int minlen,
468 							   int maxlen)
469 {
470 	PinyinKeyVector nkeys;
471 
472 	for (PinyinParsedKeyVector::const_iterator i=begin; i!=end; i++)
473 		nkeys.push_back (*i);
474 
475 	return find_phrases (vec, nkeys.begin(), nkeys.end(), minlen, maxlen);
476 }
477 
478 int
find_phrases(PhraseVector & vec,const char * keys,bool noshorter,bool nolonger)479 PinyinPhraseLib::find_phrases (PhraseVector &vec,
480 							   const char *keys,
481 							   bool noshorter,
482 							   bool nolonger)
483 {
484 	PinyinParsedKeyVector pykeys;
485 	PinyinDefaultParser parser;
486 
487 	parser.parse (*m_validator, pykeys, keys);
488 
489 	return find_phrases (vec, pykeys, noshorter, nolonger);
490 }
491 
492 int
find_phrases(PhraseVector & vec,const PinyinKeyVector::const_iterator & begin,const PinyinKeyVector::const_iterator & end,int minlen,int maxlen)493 PinyinPhraseLib::find_phrases (PhraseVector &vec,
494 							   const PinyinKeyVector::const_iterator &begin,
495 							   const PinyinKeyVector::const_iterator &end,
496 							   int minlen,
497 							   int maxlen)
498 {
499 	if (begin >= end) return 0;
500 
501 	minlen -= 1;
502 	if (minlen < 0) minlen = 0;
503 
504 	if (maxlen <= 0) maxlen = SCIM_PHRASE_MAX_LENGTH;
505 	else maxlen = std::min (maxlen, SCIM_PHRASE_MAX_LENGTH);
506 
507 	if (minlen >= maxlen) return 0;
508 
509 	std::pair<PinyinPhraseTable::iterator, PinyinPhraseTable::iterator> ptit;
510 
511 	for (int i=minlen; i<maxlen; i++) {
512 		ptit = std::equal_range (m_phrases [i].begin (),
513 								 m_phrases [i].end (),
514 								 (*begin),
515 								 m_pinyin_key_less);
516 
517 		PinyinKeyVector::const_iterator pos = begin + (std::min ((int)(end-begin-1), i));
518 
519 		for (PinyinPhraseTable::iterator tit=ptit.first; tit!=ptit.second; tit++) {
520 			find_phrases_impl (vec,
521 							 tit->get_vector ().begin(),
522 							 tit->get_vector ().end(),
523 							 begin,
524 							 pos,
525 							 end);
526 		}
527 	}
528 
529 	std::sort (vec.begin(), vec.end(), PhraseExactLessThan ());
530 	vec.erase (std::unique (vec.begin(), vec.end(), PhraseExactEqualTo ()), vec.end());
531 
532 	return vec.size ();
533 }
534 
535 void
find_phrases_impl(PhraseVector & pv,const PinyinPhraseOffsetVector::iterator & begin,const PinyinPhraseOffsetVector::iterator & end,const PinyinKeyVector::const_iterator & key_begin,const PinyinKeyVector::const_iterator & key_pos,const PinyinKeyVector::const_iterator & key_end)536 PinyinPhraseLib::find_phrases_impl (PhraseVector &pv,
537 									const PinyinPhraseOffsetVector::iterator &begin,
538 									const PinyinPhraseOffsetVector::iterator &end,
539 									const PinyinKeyVector::const_iterator &key_begin,
540 									const PinyinKeyVector::const_iterator &key_pos,
541 									const PinyinKeyVector::const_iterator &key_end)
542 {
543 	if (begin == end) return;
544 
545 	if (key_pos == key_begin) {
546 		for (PinyinPhraseOffsetVector::iterator i=begin; i!=end; i++) {
547 			if (valid_pinyin_phrase (i->first, i->second) &&
548 				get_phrase (i->first).is_enable ())
549 				pv.push_back (get_phrase (i->first));
550 		}
551 		return;
552 	}
553 
554 	std::sort (begin, end, PinyinPhraseLessThanByOffsetSP (this, m_pinyin_key_less, key_pos-key_begin));
555 
556 	std::pair<PinyinPhraseOffsetVector::iterator, PinyinPhraseOffsetVector::iterator> it =
557 		std::equal_range (begin, end, *key_pos,
558 						  PinyinPhraseLessThanByOffsetSP (this, m_pinyin_key_less, key_pos-key_begin));
559 
560 	return find_phrases_impl (pv, it.first, it.second, key_begin, key_pos-1, key_end);
561 }
562 
563 Phrase
append(const Phrase & phrase,const PinyinKeyVector & keys)564 PinyinPhraseLib::append (const Phrase &phrase, const PinyinKeyVector &keys)
565 {
566 	if (!phrase.valid () || !valid ())
567 		return Phrase ();
568 
569 	Phrase tmp = m_phrase_lib.find (phrase);
570 
571 	if (tmp.valid () && tmp.is_enable ())
572 		return tmp;
573 
574 	tmp = m_phrase_lib.append (phrase);
575 
576 	if (!tmp.valid () || !tmp.is_enable ())
577 		return Phrase ();
578 
579 	insert_phrase_into_index (tmp, keys);
580 	return tmp;
581 }
582 
583 Phrase
append(const WideString & phrase,const PinyinKeyVector & keys)584 PinyinPhraseLib::append (const WideString &phrase, const PinyinKeyVector &keys)
585 {
586 	if (phrase.length () == 0 || !valid ())
587 		return Phrase ();
588 
589 	Phrase tmp = m_phrase_lib.find (phrase);
590 
591 	if (tmp.valid () && tmp.is_enable ())
592 		return tmp;
593 
594 	tmp = m_phrase_lib.append (phrase);
595 
596 	if (!tmp.valid ())
597 		return Phrase ();
598 
599 	insert_phrase_into_index (tmp, keys);
600 	return tmp;
601 }
602 
603 bool
insert_phrase_into_index(const Phrase & phrase,const PinyinKeyVector & keys)604 PinyinPhraseLib::insert_phrase_into_index (const Phrase &phrase, const PinyinKeyVector &keys)
605 {
606 	if (!phrase.valid ()) return false;
607 
608 	// First find out all of the chars which have no valid key in keys.
609 	WideString content = phrase.get_content ();
610 	WideString nokey_content;
611 
612 	PinyinKeyVector final_keys;
613 
614 	std::vector<uint32> content_state;
615 
616 	std::vector<PinyinKeyVector> key_vv;
617 
618 	uint32 pinyin_offset = m_pinyin_lib.size ();
619 
620 	uint32 i,j,k;
621 
622 	for (i=0; i<content.length (); ++i) {
623 		if (i < keys.size () &&
624 			keys [i].get_initial () != SCIM_PINYIN_ZeroInitial &&
625 			keys [i].get_final () != SCIM_PINYIN_ZeroFinal) {
626 			//This key is valid, store it into final_key.
627 			final_keys.push_back (keys [i]);
628 			content_state.push_back (1);
629 		} else {
630 			//This key is invalid, put the content into nokey_content,
631 			//and store a zero key into final_keys,
632 			//and store the position into invalid_key_pos.
633 			nokey_content.push_back (content [i]);
634 			final_keys.push_back (PinyinKey ());
635 			content_state.push_back (0);
636 		}
637 	}
638 
639 	if (nokey_content.length ())
640 		m_pinyin_table->find_key_strings (key_vv, nokey_content);
641 	else
642 		key_vv.push_back (PinyinKeyVector ());
643 
644 	std::sort (m_phrases [content.length () -1].begin (),
645 			   m_phrases [content.length () -1].end (),
646 			   PinyinKeyExactLessThan ());
647 
648 	if (m_pinyin_lib.capacity () < m_pinyin_lib.size () + key_vv.size () * content.length ())
649 		m_pinyin_lib.reserve (m_pinyin_lib.size () + key_vv.size () * content.length () + 1);
650 
651 	for (i=0; i<key_vv.size(); ++i) {
652 		for (j=0, k=0; j<content.length (); ++j) {
653 			if (content_state [j])
654 				m_pinyin_lib.push_back (final_keys [j]);
655 			else
656 				m_pinyin_lib.push_back (key_vv [i][k++]);
657 		}
658 
659 		insert_pinyin_phrase_into_index (phrase.get_phrase_offset (),
660 										 pinyin_offset);
661 
662 		pinyin_offset = m_pinyin_lib.size ();
663 	}
664 
665 	std::sort (m_phrases [content.length () -1].begin (),
666 			   m_phrases [content.length () -1].end (), m_pinyin_key_less);
667 
668 	return true;
669 }
670 
671 bool
insert_pinyin_phrase_into_index(uint32 phrase_index,uint32 pinyin_index)672 PinyinPhraseLib::insert_pinyin_phrase_into_index (uint32 phrase_index, uint32 pinyin_index)
673 {
674 	if (!valid_pinyin_phrase (phrase_index, pinyin_index))
675 		return false;
676 
677 	uint32 len = get_phrase (phrase_index).length();
678 
679 	if (len <= 0) return false;
680 
681 	PinyinKey key = get_pinyin_key (pinyin_index);
682 
683 	PinyinPhraseTable::iterator ptit=
684 		std::lower_bound (m_phrases[len-1].begin (), m_phrases[len-1].end (), key, PinyinKeyExactLessThan ());
685 
686 	if (ptit != m_phrases[len-1].end () && PinyinKeyExactEqualTo () (*ptit,key)) {
687 		ptit->get_vector ().push_back (PinyinPhraseOffsetPair (phrase_index, pinyin_index));
688 	} else {
689 		PinyinPhraseEntry entry (key);
690 		entry.get_vector ().push_back (PinyinPhraseOffsetPair (phrase_index, pinyin_index));
691 
692 		if (ptit != m_phrases [len-1].end () &&
693 			ptit >= m_phrases [len-1].begin () &&
694 			m_phrases[len-1].size () > 0) {
695 			m_phrases[len-1].insert (ptit, entry);
696 		} else {
697 			m_phrases[len-1].push_back (entry);
698 		}
699 	}
700 	return true;
701 }
702 
703 class __PinyinPhraseCountNumber
704 {
705 	uint32 m_number;
706 public:
__PinyinPhraseCountNumber()707 	__PinyinPhraseCountNumber () : m_number (0) { }
get_number()708 	uint32 get_number () { return m_number; }
operator ()(const PinyinPhrase & phrase)709 	void operator () (const PinyinPhrase &phrase) {
710 		if (phrase.is_enable ())
711 			m_number ++;
712 	}
713 };
714 
715 uint32
count_phrase_number()716 PinyinPhraseLib::count_phrase_number ()
717 {
718 	__PinyinPhraseCountNumber counter;
719 
720 	for_each_phrase (counter);
721 
722 	return counter.get_number();
723 }
724 
725 void
create_pinyin_index()726 PinyinPhraseLib::create_pinyin_index ()
727 {
728 	if (!m_pinyin_table || !m_pinyin_table->size()) return;
729 
730 	clear_phrase_index ();
731 
732 	uint32 pinyin_offset = 0;
733 
734 	WideString content;
735 	Phrase phrase;
736 
737 	for (uint32 i=0; i<m_phrase_lib.number_of_phrases (); i++) {
738 		phrase = m_phrase_lib.get_phrase_by_index (i);
739 
740 		content = phrase.get_content ();
741 
742 		std::vector<PinyinKeyVector> key_vv;
743 		m_pinyin_table->find_key_strings (key_vv, content);
744 
745 		for (uint32 j=0; j<key_vv.size(); j++) {
746 			for (uint32 k=0; k<key_vv[j].size(); k++)
747 				m_pinyin_lib.push_back (key_vv[j][k]);
748 
749 			insert_pinyin_phrase_into_index (phrase.get_phrase_offset (), pinyin_offset);
750 
751 			pinyin_offset = m_pinyin_lib.size ();
752 		}
753 #if 0
754 		if (key_vv.size () > 1 && content.length () > 1) {
755 			for (uint32 x=0; x<key_vv.size (); x++) {
756 				std::cerr << phrase.frequency () << "\t| " <<
757 						utf8_wcstombs (content) << " =";
758 				for (uint32 y=0; y<key_vv[x].size (); y++)
759 					std::cerr << " " << key_vv[x][y];
760 				std::cerr << "\n";
761 			}
762 		}
763 #endif
764 		std::cout << "." << std::flush;
765 	}
766 
767 	sort_phrase_tables ();
768 
769 	std::cout << "Phrase Number = " << count_phrase_number () << "\n";
770 }
771 
772 void
sort_phrase_tables()773 PinyinPhraseLib::sort_phrase_tables ()
774 {
775 	for (uint32 i=0; i<SCIM_PHRASE_MAX_LENGTH; i++) {
776 		if (m_phrases [i].size ())
777 			std::sort (m_phrases[i].begin (), m_phrases[i].end (), m_pinyin_key_less);
778 	}
779 }
780 
781 void
refine_phrase_index(PinyinPhraseValidatorFunc pinyin_phrase_validator)782 PinyinPhraseLib::refine_phrase_index (PinyinPhraseValidatorFunc pinyin_phrase_validator)
783 {
784 	for (uint32 i=0; i<SCIM_PHRASE_MAX_LENGTH; i++) {
785 		for (PinyinPhraseTable::iterator tit=m_phrases[i].begin(); tit!=m_phrases[i].end(); tit++) {
786 			std::sort (tit->get_vector ().begin (),
787 					   tit->get_vector ().end (),
788 					   m_pinyin_phrase_less_by_offset);
789 			tit->get_vector ().erase (
790 							std::unique (tit->get_vector ().begin (),
791 										 tit->get_vector ().end (),
792 										 m_pinyin_phrase_equal_by_offset),
793 							tit->get_vector ().end ());
794 			if (pinyin_phrase_validator) {
795 				PinyinPhraseOffsetVector tmp;
796 				tmp.reserve (tit->get_vector ().size ());
797 				for (PinyinPhraseOffsetVector::iterator vit=tit->get_vector ().begin ();
798 														vit!=tit->get_vector ().end ();
799 														vit++) {
800 					if (pinyin_phrase_validator (PinyinPhrase (this, vit->first, vit->second)))
801 						tmp.push_back (*vit);
802 				}
803 				tit->get_vector () = tmp;
804 			}
805 		}
806 	}
807 }
808 
809 void
refine_pinyin_lib()810 PinyinPhraseLib::refine_pinyin_lib ()
811 {
812 	PinyinKeyVector tmp_pinyin_lib;
813 
814 	PinyinKeyVector::const_iterator result;
815 	PinyinKeyVector::const_iterator vit_begin;
816 	PinyinKeyVector::const_iterator vit_end;
817 
818 	uint32 len;
819 	uint32 pinyin_offset;
820 
821 	tmp_pinyin_lib.reserve (m_pinyin_lib.size () + 1);
822 
823 	for (int i=SCIM_PHRASE_MAX_LENGTH-1; i>=0; i--) {
824 		for (PinyinPhraseTable::iterator tit=m_phrases[i].begin(); tit!=m_phrases[i].end(); tit++) {
825 			for (PinyinPhraseOffsetVector::iterator vit=tit->get_vector ().begin();
826 					vit!=tit->get_vector ().end(); vit++) {
827 				len = get_phrase (vit->first).length ();
828 
829 				if (len > 0) {
830 					vit_begin = m_pinyin_lib.begin () + vit->second;
831 					vit_end   = vit_begin + len;
832 
833 					for (result  = tmp_pinyin_lib.begin ();
834 						 result != tmp_pinyin_lib.end ();
835 						 result ++) {
836 						uint32 j;
837 						for (j=0; j< len && result + j < tmp_pinyin_lib.end (); j++) {
838 							if (!m_pinyin_key_equal (*(result+j), *(vit_begin + j)))
839 								break;
840 						}
841 						if (j == len)
842 							break;
843 					}
844 
845 					/*
846 					result = std::find_end (tmp_pinyin_lib.begin (),
847 										  tmp_pinyin_lib.end (),
848 										  vit_begin,
849 										  vit_end,
850 										  m_pinyin_key_equal);
851 					*/
852 
853 					if (result != tmp_pinyin_lib.end ())
854 						pinyin_offset = result - tmp_pinyin_lib.begin ();
855 					else {
856 						pinyin_offset = tmp_pinyin_lib.size ();
857 						for (uint32 j=0; j<len; j++)
858 							tmp_pinyin_lib.push_back (get_pinyin_key (vit->second + j));
859 					}
860 					vit->second = pinyin_offset;
861 				}
862 				std::cout << "." << std::flush;
863 			}
864 		}
865 	}
866 
867 	std::cout << "\n";
868 
869 	m_pinyin_lib = tmp_pinyin_lib;
870 }
871 
872 void
refine_library(PinyinPhraseValidatorFunc pinyin_phrase_validator)873 PinyinPhraseLib::refine_library (PinyinPhraseValidatorFunc pinyin_phrase_validator)
874 {
875 	std::cout << "\n" << "refining phrase index." << "\n";
876 	refine_phrase_index (pinyin_phrase_validator);
877 	std::cout << "\n" << "refining pinyin lib." << "\n";
878 	refine_pinyin_lib ();
879 }
880 
881 void
clear_phrase_index()882 PinyinPhraseLib::clear_phrase_index ()
883 {
884 	for (int i=0; i<SCIM_PHRASE_MAX_LENGTH; i++)
885 		m_phrases [i].clear ();
886 }
887 
888 void
compact_memory()889 PinyinPhraseLib::compact_memory ()
890 {
891 	PinyinKeyVector (m_pinyin_lib).swap (m_pinyin_lib);
892 
893 	for (uint32 i=0; i<SCIM_PHRASE_MAX_LENGTH; i++) {
894 		for (uint32 j=0; j<m_phrases [i].size (); j++)
895 			(m_phrases [i])[j].compact_memory ();
896 	}
897 }
898 
899 void
dump_content(std::ostream & os,int minlen,int maxlen)900 PinyinPhraseLib::dump_content (std::ostream &os, int minlen, int maxlen)
901 {
902 	PinyinPhraseLessThanByOffset less_op (this, m_pinyin_key_less);
903 	if (minlen < 1) minlen = 1;
904 	if (maxlen > SCIM_PHRASE_MAX_LENGTH) maxlen = SCIM_PHRASE_MAX_LENGTH;
905 
906 	for (int i = minlen; i <= maxlen; ++ i) {
907 		PinyinPhraseOffsetVector offsets;
908 		for (PinyinPhraseTable::iterator tit = m_phrases [i-1].begin (); tit != m_phrases [i-1].end (); ++ tit) {
909 			PinyinPhraseOffsetVector::iterator begin = tit->get_vector ().begin ();
910 			PinyinPhraseOffsetVector::iterator end = tit->get_vector ().end ();
911 			offsets.insert (offsets.end (), begin, end);
912 		}
913 
914 		std::sort (offsets.begin (), offsets.end (), less_op);
915 
916 		for (PinyinPhraseOffsetVector::iterator oit = offsets.begin (); oit != offsets.end (); ++ oit) {
917 			bool before = false, after = false;
918 
919 			os << get_phrase (oit->first).frequency () << "\t";
920 			if (oit > offsets.begin () && get_phrase ((oit-1)->first) == get_phrase (oit->first)) before = true;
921 			if (oit < offsets.end () - 1 && get_phrase ((oit+1)->first) == get_phrase (oit->first)) after = true;
922 			if (before || after) os << "+";
923 			else os << "-";
924 			os << utf8_wcstombs (get_phrase (oit->first).get_content ());
925 			os << " =";
926 			for (unsigned int j = 0; j < get_phrase (oit->first).length (); ++ j)
927 				os << " " << get_pinyin_key (oit->second + j);
928 			os << "\n";
929 		}
930 	}
931 }
932 
933 void
optimize_phrase_frequencies(uint32 max_freq)934 PinyinPhraseLib::optimize_phrase_frequencies (uint32 max_freq)
935 {
936 	uint32 freq = m_phrase_lib.get_max_phrase_frequency ();
937 
938 	if (freq < max_freq || !max_freq) return;
939 
940 	double ratio = ((double) max_freq) / freq;
941 
942 	Phrase phrase;
943 
944 	for (int i = 0; i<(int)m_phrase_lib.number_of_phrases (); ++i) {
945 		phrase = m_phrase_lib.get_phrase_by_index (i);
946 		phrase.set_frequency ((uint32)(phrase.frequency () * ratio));
947 	}
948 }
949 
950 /*
951 vi:ts=4:nowrap:ai
952 */
953