1 /*
2 * libpinyin
3 * Library to deal with pinyin.
4 *
5 * Copyright (C) 2010 Peng Wu
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include <math.h>
22 #include "stl_lite.h"
23 #include "novel_types.h"
24 #include "phrase_index.h"
25 #include "facade_phrase_table3.h"
26 #include "phrase_lookup.h"
27
28 using namespace pinyin;
29
30
31 /*
32 const gfloat PhraseLookup::bigram_lambda = lambda;
33 const gfloat PhraseLookup::unigram_lambda = 1 - lambda;
34 */
35
populate_prefixes(GPtrArray * steps_index,GPtrArray * steps_content)36 static bool populate_prefixes(GPtrArray * steps_index,
37 GPtrArray * steps_content) {
38
39 lookup_key_t initial_key = sentence_start;
40 lookup_value_t initial_value(log(1.f));
41 initial_value.m_handles[1] = sentence_start;
42
43 LookupStepContent initial_step_content = (LookupStepContent)
44 g_ptr_array_index(steps_content, 0);
45 g_array_append_val(initial_step_content, initial_value);
46
47 LookupStepIndex initial_step_index = (LookupStepIndex)
48 g_ptr_array_index(steps_index, 0);
49 g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key),
50 GUINT_TO_POINTER(initial_step_content->len - 1));
51
52 return true;
53 }
54
init_steps(GPtrArray * steps_index,GPtrArray * steps_content,int nstep)55 static bool init_steps(GPtrArray * steps_index,
56 GPtrArray * steps_content,
57 int nstep) {
58
59 /* add null start step */
60 g_ptr_array_set_size(steps_index, nstep);
61 g_ptr_array_set_size(steps_content, nstep);
62
63 for ( int i = 0; i < nstep; ++i ){
64 /* initialize steps_index */
65 g_ptr_array_index(steps_index, i) = g_hash_table_new
66 (g_direct_hash, g_direct_equal);
67 /* initialize steps_content */
68 g_ptr_array_index(steps_content, i) = g_array_new
69 (FALSE, FALSE, sizeof(lookup_value_t));
70 }
71
72 return true;
73 }
74
clear_steps(GPtrArray * steps_index,GPtrArray * steps_content)75 static void clear_steps(GPtrArray * steps_index,
76 GPtrArray * steps_content){
77 /* clear steps_index */
78 for ( size_t i = 0; i < steps_index->len; ++i){
79 GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
80 g_hash_table_destroy(table);
81 g_ptr_array_index(steps_index, i) = NULL;
82 }
83
84 /* free steps_content */
85 for ( size_t i = 0; i < steps_content->len; ++i){
86 GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
87 g_array_free(array, TRUE);
88 g_ptr_array_index(steps_content, i) = NULL;
89 }
90 }
91
PhraseLookup(const gfloat lambda,FacadePhraseTable3 * phrase_table,FacadePhraseIndex * phrase_index,Bigram * system_bigram,Bigram * user_bigram)92 PhraseLookup::PhraseLookup(const gfloat lambda,
93 FacadePhraseTable3 * phrase_table,
94 FacadePhraseIndex * phrase_index,
95 Bigram * system_bigram,
96 Bigram * user_bigram)
97 : bigram_lambda(lambda),
98 unigram_lambda(1. - lambda)
99 {
100 m_phrase_table = phrase_table;
101 m_phrase_index = phrase_index;
102 m_system_bigram = system_bigram;
103 m_user_bigram = user_bigram;
104
105 m_steps_index = g_ptr_array_new();
106 m_steps_content = g_ptr_array_new();
107
108 /* the member variables below are saved in get_best_match call. */
109 m_sentence = NULL;
110 m_sentence_length = 0;
111 }
112
~PhraseLookup()113 PhraseLookup::~PhraseLookup(){
114 clear_steps(m_steps_index, m_steps_content);
115 g_ptr_array_free(m_steps_index, TRUE);
116 g_ptr_array_free(m_steps_content, TRUE);
117 }
118
get_best_match(int sentence_length,ucs4_t sentence[],MatchResult & result)119 bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
120 MatchResult & result){
121 m_sentence_length = sentence_length;
122 m_sentence = sentence;
123 int nstep = m_sentence_length + 1;
124
125 clear_steps(m_steps_index, m_steps_content);
126
127 init_steps(m_steps_index, m_steps_content, nstep);
128
129 populate_prefixes(m_steps_index, m_steps_content);
130
131 PhraseTokens tokens;
132 memset(tokens, 0, sizeof(PhraseTokens));
133 m_phrase_index->prepare_tokens(tokens);
134
135 for ( int i = 0; i < nstep - 1; ++i ){
136 for ( int m = i + 1; m < nstep; ++m ){
137
138 m_phrase_index->clear_tokens(tokens);
139
140 /* do one phrase table search. */
141 int result = m_phrase_table->search(m - i, sentence + i, tokens);
142
143 /* found next phrase */
144 if ( result & SEARCH_OK ) {
145 search_bigram2(i, tokens),
146 search_unigram2(i, tokens);
147 }
148
149 /* no longer phrase */
150 if (!(result & SEARCH_CONTINUED))
151 break;
152 }
153 }
154
155 m_phrase_index->destroy_tokens(tokens);
156
157 return final_step(result);
158 }
159
160 #if 0
161
162 bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){
163
164 LookupStepContent lookup_content = (LookupStepContent)
165 g_ptr_array_index(m_steps_content, nstep);
166 if ( 0 == lookup_content->len )
167 return false;
168
169 lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0);
170 /* find the maximum node */
171 for ( size_t i = 1; i < lookup_content->len; ++i ){
172 lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
173 if ( cur_value->m_poss > max_value->m_poss )
174 max_value = cur_value;
175 }
176
177 return unigram_gen_next_step(nstep, max_value, token);
178 }
179
180 bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){
181 bool found = false;
182
183 LookupStepContent lookup_content = (LookupStepContent)
184 g_ptr_array_index(m_steps_content, nstep);
185 if ( 0 == lookup_content->len )
186 return false;
187
188 for ( size_t i = 0; i < lookup_content->len; ++i ){
189 lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
190 phrase_token_t index_token = cur_value->m_handles[1];
191 SingleGram * system, * user;
192 m_system_bigram->load(index_token, system);
193 m_user_bigram->load(index_token, user);
194
195 if ( !merge_single_gram(&m_merged_single_gram, system, user) )
196 continue;
197
198 guint32 freq;
199 if ( m_merged_single_gram.get_freq(token, freq) ){
200 guint32 total_freq;
201 m_merged_single_gram.get_total_freq(total_freq);
202 gfloat bigram_poss = freq / (gfloat) total_freq;
203 found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
204 }
205
206 if (system)
207 delete system;
208 if (user)
209 delete user;
210 }
211
212 return found;
213 }
214
215 #endif
216
search_unigram2(int nstep,PhraseTokens tokens)217 bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){
218 bool found = false;
219
220 LookupStepContent lookup_content = (LookupStepContent)
221 g_ptr_array_index(m_steps_content, nstep);
222 if ( 0 == lookup_content->len )
223 return found;
224
225 /* find the maximum node */
226 lookup_value_t * max_value = &g_array_index
227 (lookup_content, lookup_value_t, 0);
228
229 for (size_t i = 1; i < lookup_content->len; ++i) {
230 lookup_value_t * cur_value = &g_array_index
231 (lookup_content, lookup_value_t, i);
232 if (cur_value->m_poss > max_value->m_poss)
233 max_value = cur_value;
234 }
235
236 /* iterate over tokens */
237 for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
238 GArray * array = tokens[n];
239 if (NULL == array)
240 continue;
241
242 /* just skip the loop when the length is zero. */
243 for (size_t k = 0; k < array->len; ++k) {
244 phrase_token_t token =
245 g_array_index(array, phrase_token_t, k);
246
247 found = unigram_gen_next_step
248 (nstep, max_value, token) || found;
249 }
250 }
251
252 return found;
253 }
254
search_bigram2(int nstep,PhraseTokens tokens)255 bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){
256 bool found = false;
257
258 LookupStepContent lookup_content = (LookupStepContent)
259 g_ptr_array_index(m_steps_content, nstep);
260 if (0 == lookup_content->len)
261 return found;
262
263 for (size_t i = 0; i < lookup_content->len; ++i) {
264 lookup_value_t * cur_value = &g_array_index
265 (lookup_content, lookup_value_t, i);
266 phrase_token_t index_token = cur_value->m_handles[1];
267
268 SingleGram * system = NULL, * user = NULL;
269 m_system_bigram->load(index_token, system);
270 m_user_bigram->load(index_token, user);
271
272 if (!merge_single_gram
273 (&m_merged_single_gram, system, user))
274 continue;
275
276 /* iterate over tokens */
277 for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
278 GArray * array = tokens[n];
279 if (NULL == array)
280 continue;
281
282 /* just skip the loop when the length is zero. */
283 for (size_t k = 0; k < array->len; ++k) {
284 phrase_token_t token =
285 g_array_index(array, phrase_token_t, k);
286
287 guint32 freq = 0;
288 if (m_merged_single_gram.get_freq(token, freq)) {
289 guint32 total_freq = 0;
290 m_merged_single_gram.get_total_freq(total_freq);
291
292 gfloat bigram_poss = freq / (gfloat) total_freq;
293 found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
294 }
295 }
296 }
297
298 if (system)
299 delete system;
300 if (user)
301 delete user;
302 }
303
304 return found;
305 }
306
unigram_gen_next_step(int nstep,lookup_value_t * cur_value,phrase_token_t token)307 bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value,
308 phrase_token_t token){
309
310 if (m_phrase_index->get_phrase_item(token, m_cached_phrase_item))
311 return false;
312
313 size_t phrase_length = m_cached_phrase_item.get_phrase_length();
314 gdouble elem_poss = m_cached_phrase_item.get_unigram_frequency() / (gdouble)
315 m_phrase_index->get_phrase_index_total_freq();
316 if ( elem_poss < DBL_EPSILON )
317 return false;
318
319 lookup_value_t next_value;
320 next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
321 next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda);
322 next_value.m_last_step = nstep;
323
324 return save_next_step(nstep + phrase_length, cur_value, &next_value);
325 }
326
bigram_gen_next_step(int nstep,lookup_value_t * cur_value,phrase_token_t token,gfloat bigram_poss)327 bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){
328
329 if ( m_phrase_index->get_phrase_item(token, m_cached_phrase_item))
330 return false;
331
332 size_t phrase_length = m_cached_phrase_item.get_phrase_length();
333 gdouble unigram_poss = m_cached_phrase_item.get_unigram_frequency() /
334 (gdouble) m_phrase_index->get_phrase_index_total_freq();
335
336 if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
337 return false;
338
339 lookup_value_t next_value;
340 next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
341 next_value.m_poss = cur_value->m_poss +
342 log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss );
343 next_value.m_last_step = nstep;
344
345 return save_next_step(nstep + phrase_length, cur_value, &next_value);
346 }
347
save_next_step(int next_step_pos,lookup_value_t * cur_value,lookup_value_t * next_value)348 bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){
349
350 LookupStepIndex next_lookup_index = (LookupStepIndex)
351 g_ptr_array_index(m_steps_index, next_step_pos);
352 LookupStepContent next_lookup_content = (LookupStepContent)
353 g_ptr_array_index(m_steps_content, next_step_pos);
354
355 lookup_key_t next_key = next_value->m_handles[1];
356
357 gpointer key = NULL, value = NULL;
358 gboolean lookup_result = g_hash_table_lookup_extended
359 (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
360
361 if (!lookup_result){
362 g_array_append_val(next_lookup_content, *next_value);
363 g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key),
364 GUINT_TO_POINTER(next_lookup_content->len - 1));
365 return true;
366 }else{
367 size_t step_index = GPOINTER_TO_UINT(value);
368 lookup_value_t * orig_next_value = &g_array_index
369 (next_lookup_content, lookup_value_t, step_index);
370
371 if ( orig_next_value->m_poss < next_value->m_poss ){
372 orig_next_value->m_handles[0] = next_value->m_handles[0];
373 assert(orig_next_value->m_handles[1] == next_value->m_handles[1]);
374 orig_next_value->m_poss = next_value->m_poss;
375 orig_next_value->m_last_step = next_value->m_last_step;
376 return true;
377 }
378 return false;
379 }
380 }
381
final_step(MatchResult & result)382 bool PhraseLookup::final_step(MatchResult & result){
383
384 /* reset results */
385 g_array_set_size(result, m_steps_content->len - 1);
386 for ( size_t i = 0; i < result->len; ++i ){
387 phrase_token_t * token = &g_array_index(result, phrase_token_t, i);
388 *token = null_token;
389 }
390
391 /* find max element */
392 size_t last_step_pos = m_steps_content->len - 1;
393 LookupStepContent last_step_content = (LookupStepContent) g_ptr_array_index
394 (m_steps_content, last_step_pos);
395 if ( last_step_content->len == 0 )
396 return false;
397
398 lookup_value_t * max_value = &g_array_index
399 (last_step_content, lookup_value_t, 0);
400 for ( size_t i = 1; i < last_step_content->len; ++i ){
401 lookup_value_t * cur_value = &g_array_index
402 (last_step_content, lookup_value_t, i);
403 if ( cur_value->m_poss > max_value->m_poss )
404 max_value = cur_value;
405 }
406
407 /* backtracing */
408 while( true ){
409 int cur_step_pos = max_value->m_last_step;
410 if ( -1 == cur_step_pos )
411 break;
412
413 phrase_token_t * token = &g_array_index
414 (result, phrase_token_t, cur_step_pos);
415 *token = max_value->m_handles[1];
416
417 phrase_token_t last_token = max_value->m_handles[0];
418 LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos);
419
420 gpointer key = NULL, value = NULL;
421 gboolean result = g_hash_table_lookup_extended
422 (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
423 if ( !result )
424 return false;
425
426 LookupStepContent lookup_step_content = (LookupStepContent)
427 g_ptr_array_index(m_steps_content, cur_step_pos);
428 max_value = &g_array_index
429 (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
430 }
431
432 /* no need to reverse the result */
433 return true;
434 }
435