1 /*
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *
5  *  Copyright (C) 2010 Peng Wu
6  *
7  *  This program is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include <math.h>
22 #include "stl_lite.h"
23 #include "novel_types.h"
24 #include "phrase_index.h"
25 #include "facade_phrase_table3.h"
26 #include "phrase_lookup.h"
27 
28 using namespace pinyin;
29 
30 
31 /*
32 const gfloat PhraseLookup::bigram_lambda = lambda;
33 const gfloat PhraseLookup::unigram_lambda = 1 - lambda;
34 */
35 
populate_prefixes(GPtrArray * steps_index,GPtrArray * steps_content)36 static bool populate_prefixes(GPtrArray * steps_index,
37                               GPtrArray * steps_content) {
38 
39     lookup_key_t initial_key = sentence_start;
40     lookup_value_t initial_value(log(1.f));
41     initial_value.m_handles[1] = sentence_start;
42 
43     LookupStepContent initial_step_content = (LookupStepContent)
44         g_ptr_array_index(steps_content, 0);
45     g_array_append_val(initial_step_content, initial_value);
46 
47     LookupStepIndex initial_step_index = (LookupStepIndex)
48         g_ptr_array_index(steps_index, 0);
49     g_hash_table_insert(initial_step_index, GUINT_TO_POINTER(initial_key),
50                         GUINT_TO_POINTER(initial_step_content->len - 1));
51 
52     return true;
53 }
54 
init_steps(GPtrArray * steps_index,GPtrArray * steps_content,int nstep)55 static bool init_steps(GPtrArray * steps_index,
56                        GPtrArray * steps_content,
57                        int nstep) {
58 
59     /* add null start step */
60     g_ptr_array_set_size(steps_index, nstep);
61     g_ptr_array_set_size(steps_content, nstep);
62 
63     for ( int i = 0; i < nstep; ++i ){
64         /* initialize steps_index */
65         g_ptr_array_index(steps_index, i) = g_hash_table_new
66             (g_direct_hash, g_direct_equal);
67         /* initialize steps_content */
68         g_ptr_array_index(steps_content, i) = g_array_new
69             (FALSE, FALSE, sizeof(lookup_value_t));
70     }
71 
72     return true;
73 }
74 
clear_steps(GPtrArray * steps_index,GPtrArray * steps_content)75 static void clear_steps(GPtrArray * steps_index,
76                         GPtrArray * steps_content){
77     /* clear steps_index */
78     for ( size_t i = 0; i < steps_index->len; ++i){
79         GHashTable * table = (GHashTable *) g_ptr_array_index(steps_index, i);
80         g_hash_table_destroy(table);
81         g_ptr_array_index(steps_index, i) = NULL;
82     }
83 
84     /* free steps_content */
85     for ( size_t i = 0; i < steps_content->len; ++i){
86         GArray * array = (GArray *) g_ptr_array_index(steps_content, i);
87         g_array_free(array, TRUE);
88         g_ptr_array_index(steps_content, i) = NULL;
89     }
90 }
91 
PhraseLookup(const gfloat lambda,FacadePhraseTable3 * phrase_table,FacadePhraseIndex * phrase_index,Bigram * system_bigram,Bigram * user_bigram)92 PhraseLookup::PhraseLookup(const gfloat lambda,
93                            FacadePhraseTable3 * phrase_table,
94                            FacadePhraseIndex * phrase_index,
95                            Bigram * system_bigram,
96                            Bigram * user_bigram)
97     : bigram_lambda(lambda),
98       unigram_lambda(1. - lambda)
99 {
100     m_phrase_table = phrase_table;
101     m_phrase_index = phrase_index;
102     m_system_bigram = system_bigram;
103     m_user_bigram = user_bigram;
104 
105     m_steps_index = g_ptr_array_new();
106     m_steps_content = g_ptr_array_new();
107 
108     /* the member variables below are saved in get_best_match call. */
109     m_sentence = NULL;
110     m_sentence_length = 0;
111 }
112 
~PhraseLookup()113 PhraseLookup::~PhraseLookup(){
114     clear_steps(m_steps_index, m_steps_content);
115     g_ptr_array_free(m_steps_index, TRUE);
116     g_ptr_array_free(m_steps_content, TRUE);
117 }
118 
get_best_match(int sentence_length,ucs4_t sentence[],MatchResult & result)119 bool PhraseLookup::get_best_match(int sentence_length, ucs4_t sentence[],
120                                   MatchResult & result){
121     m_sentence_length = sentence_length;
122     m_sentence = sentence;
123     int nstep = m_sentence_length + 1;
124 
125     clear_steps(m_steps_index, m_steps_content);
126 
127     init_steps(m_steps_index, m_steps_content, nstep);
128 
129     populate_prefixes(m_steps_index, m_steps_content);
130 
131     PhraseTokens tokens;
132     memset(tokens, 0, sizeof(PhraseTokens));
133     m_phrase_index->prepare_tokens(tokens);
134 
135     for ( int i = 0; i < nstep - 1; ++i ){
136         for ( int m = i + 1; m < nstep; ++m ){
137 
138             m_phrase_index->clear_tokens(tokens);
139 
140             /* do one phrase table search. */
141             int result = m_phrase_table->search(m - i, sentence + i, tokens);
142 
143             /* found next phrase */
144             if ( result & SEARCH_OK ) {
145                 search_bigram2(i, tokens),
146                     search_unigram2(i, tokens);
147             }
148 
149             /* no longer phrase */
150             if (!(result & SEARCH_CONTINUED))
151                 break;
152         }
153     }
154 
155     m_phrase_index->destroy_tokens(tokens);
156 
157     return final_step(result);
158 }
159 
160 #if 0
161 
162 bool PhraseLookup::search_unigram(int nstep, phrase_token_t token){
163 
164     LookupStepContent lookup_content = (LookupStepContent)
165         g_ptr_array_index(m_steps_content, nstep);
166     if ( 0 == lookup_content->len )
167         return false;
168 
169     lookup_value_t * max_value = &g_array_index(lookup_content, lookup_value_t, 0);
170     /* find the maximum node */
171     for ( size_t i = 1; i < lookup_content->len; ++i ){
172         lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
173         if ( cur_value->m_poss > max_value->m_poss )
174             max_value = cur_value;
175     }
176 
177     return unigram_gen_next_step(nstep, max_value, token);
178 }
179 
180 bool PhraseLookup::search_bigram(int nstep, phrase_token_t token){
181     bool found = false;
182 
183     LookupStepContent lookup_content = (LookupStepContent)
184         g_ptr_array_index(m_steps_content, nstep);
185     if ( 0 == lookup_content->len )
186         return false;
187 
188     for ( size_t i = 0; i < lookup_content->len; ++i ){
189         lookup_value_t * cur_value = &g_array_index(lookup_content, lookup_value_t, i);
190         phrase_token_t index_token = cur_value->m_handles[1];
191         SingleGram * system, * user;
192         m_system_bigram->load(index_token, system);
193         m_user_bigram->load(index_token, user);
194 
195         if ( !merge_single_gram(&m_merged_single_gram, system, user) )
196             continue;
197 
198         guint32 freq;
199         if ( m_merged_single_gram.get_freq(token, freq) ){
200             guint32 total_freq;
201             m_merged_single_gram.get_total_freq(total_freq);
202             gfloat bigram_poss = freq / (gfloat) total_freq;
203             found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
204         }
205 
206         if (system)
207             delete system;
208         if (user)
209             delete user;
210     }
211 
212     return found;
213 }
214 
215 #endif
216 
search_unigram2(int nstep,PhraseTokens tokens)217 bool PhraseLookup::search_unigram2(int nstep, PhraseTokens tokens){
218     bool found = false;
219 
220     LookupStepContent lookup_content = (LookupStepContent)
221         g_ptr_array_index(m_steps_content, nstep);
222     if ( 0 == lookup_content->len )
223         return found;
224 
225     /* find the maximum node */
226     lookup_value_t * max_value = &g_array_index
227         (lookup_content, lookup_value_t, 0);
228 
229     for (size_t i = 1; i < lookup_content->len; ++i) {
230         lookup_value_t * cur_value = &g_array_index
231             (lookup_content, lookup_value_t, i);
232         if (cur_value->m_poss > max_value->m_poss)
233             max_value = cur_value;
234     }
235 
236     /* iterate over tokens */
237     for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
238         GArray * array = tokens[n];
239         if (NULL == array)
240             continue;
241 
242         /* just skip the loop when the length is zero. */
243         for (size_t k = 0; k < array->len; ++k) {
244             phrase_token_t token =
245                 g_array_index(array, phrase_token_t, k);
246 
247             found = unigram_gen_next_step
248                 (nstep, max_value, token) || found;
249         }
250     }
251 
252     return found;
253 }
254 
search_bigram2(int nstep,PhraseTokens tokens)255 bool PhraseLookup::search_bigram2(int nstep, PhraseTokens tokens){
256     bool found = false;
257 
258     LookupStepContent lookup_content = (LookupStepContent)
259         g_ptr_array_index(m_steps_content, nstep);
260     if (0 == lookup_content->len)
261         return found;
262 
263     for (size_t i = 0; i < lookup_content->len; ++i) {
264         lookup_value_t * cur_value = &g_array_index
265             (lookup_content, lookup_value_t, i);
266         phrase_token_t index_token = cur_value->m_handles[1];
267 
268         SingleGram * system = NULL, * user = NULL;
269         m_system_bigram->load(index_token, system);
270         m_user_bigram->load(index_token, user);
271 
272         if (!merge_single_gram
273             (&m_merged_single_gram, system, user))
274             continue;
275 
276         /* iterate over tokens */
277         for (size_t n = 0; n < PHRASE_INDEX_LIBRARY_COUNT; ++n) {
278             GArray * array = tokens[n];
279             if (NULL == array)
280                 continue;
281 
282             /* just skip the loop when the length is zero. */
283             for (size_t k = 0; k < array->len; ++k) {
284                 phrase_token_t token =
285                     g_array_index(array, phrase_token_t, k);
286 
287                 guint32 freq = 0;
288                 if (m_merged_single_gram.get_freq(token, freq)) {
289                     guint32 total_freq = 0;
290                     m_merged_single_gram.get_total_freq(total_freq);
291 
292                     gfloat bigram_poss = freq / (gfloat) total_freq;
293                     found = bigram_gen_next_step(nstep, cur_value, token, bigram_poss) || found;
294                 }
295             }
296         }
297 
298         if (system)
299             delete system;
300         if (user)
301             delete user;
302     }
303 
304     return found;
305 }
306 
unigram_gen_next_step(int nstep,lookup_value_t * cur_value,phrase_token_t token)307 bool PhraseLookup::unigram_gen_next_step(int nstep, lookup_value_t * cur_value,
308 phrase_token_t token){
309 
310     if (m_phrase_index->get_phrase_item(token, m_cached_phrase_item))
311         return false;
312 
313     size_t phrase_length = m_cached_phrase_item.get_phrase_length();
314     gdouble elem_poss = m_cached_phrase_item.get_unigram_frequency() / (gdouble)
315         m_phrase_index->get_phrase_index_total_freq();
316     if ( elem_poss < DBL_EPSILON )
317         return false;
318 
319     lookup_value_t next_value;
320     next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
321     next_value.m_poss = cur_value->m_poss + log(elem_poss * unigram_lambda);
322     next_value.m_last_step = nstep;
323 
324     return save_next_step(nstep + phrase_length, cur_value, &next_value);
325 }
326 
bigram_gen_next_step(int nstep,lookup_value_t * cur_value,phrase_token_t token,gfloat bigram_poss)327 bool PhraseLookup::bigram_gen_next_step(int nstep, lookup_value_t * cur_value, phrase_token_t token, gfloat bigram_poss){
328 
329     if ( m_phrase_index->get_phrase_item(token, m_cached_phrase_item))
330         return false;
331 
332     size_t phrase_length = m_cached_phrase_item.get_phrase_length();
333     gdouble unigram_poss = m_cached_phrase_item.get_unigram_frequency() /
334         (gdouble) m_phrase_index->get_phrase_index_total_freq();
335 
336     if ( bigram_poss < FLT_EPSILON && unigram_poss < DBL_EPSILON )
337         return false;
338 
339     lookup_value_t next_value;
340     next_value.m_handles[0] = cur_value->m_handles[1]; next_value.m_handles[1] = token;
341     next_value.m_poss = cur_value->m_poss +
342         log( bigram_lambda * bigram_poss + unigram_lambda * unigram_poss );
343     next_value.m_last_step = nstep;
344 
345     return save_next_step(nstep + phrase_length, cur_value, &next_value);
346 }
347 
save_next_step(int next_step_pos,lookup_value_t * cur_value,lookup_value_t * next_value)348 bool PhraseLookup::save_next_step(int next_step_pos, lookup_value_t * cur_value, lookup_value_t * next_value){
349 
350     LookupStepIndex next_lookup_index = (LookupStepIndex)
351         g_ptr_array_index(m_steps_index, next_step_pos);
352     LookupStepContent next_lookup_content = (LookupStepContent)
353         g_ptr_array_index(m_steps_content, next_step_pos);
354 
355     lookup_key_t next_key = next_value->m_handles[1];
356 
357     gpointer key = NULL, value = NULL;
358     gboolean lookup_result = g_hash_table_lookup_extended
359         (next_lookup_index, GUINT_TO_POINTER(next_key), &key, &value);
360 
361     if (!lookup_result){
362         g_array_append_val(next_lookup_content, *next_value);
363         g_hash_table_insert(next_lookup_index, GUINT_TO_POINTER(next_key),
364                             GUINT_TO_POINTER(next_lookup_content->len - 1));
365         return true;
366     }else{
367         size_t step_index = GPOINTER_TO_UINT(value);
368         lookup_value_t * orig_next_value = &g_array_index
369             (next_lookup_content, lookup_value_t, step_index);
370 
371         if ( orig_next_value->m_poss < next_value->m_poss ){
372             orig_next_value->m_handles[0] = next_value->m_handles[0];
373             assert(orig_next_value->m_handles[1] == next_value->m_handles[1]);
374             orig_next_value->m_poss = next_value->m_poss;
375             orig_next_value->m_last_step = next_value->m_last_step;
376             return true;
377         }
378         return false;
379     }
380 }
381 
final_step(MatchResult & result)382 bool PhraseLookup::final_step(MatchResult & result){
383 
384     /* reset results */
385     g_array_set_size(result, m_steps_content->len - 1);
386     for ( size_t i = 0; i < result->len; ++i ){
387         phrase_token_t * token = &g_array_index(result, phrase_token_t, i);
388         *token = null_token;
389     }
390 
391     /* find max element */
392     size_t last_step_pos = m_steps_content->len - 1;
393     LookupStepContent last_step_content =  (LookupStepContent) g_ptr_array_index
394         (m_steps_content, last_step_pos);
395     if ( last_step_content->len == 0 )
396         return false;
397 
398     lookup_value_t * max_value = &g_array_index
399         (last_step_content, lookup_value_t, 0);
400     for ( size_t i = 1; i < last_step_content->len; ++i ){
401         lookup_value_t * cur_value = &g_array_index
402             (last_step_content, lookup_value_t, i);
403         if ( cur_value->m_poss > max_value->m_poss )
404             max_value = cur_value;
405     }
406 
407     /* backtracing */
408     while( true ){
409         int cur_step_pos = max_value->m_last_step;
410         if ( -1 == cur_step_pos )
411             break;
412 
413         phrase_token_t * token = &g_array_index
414             (result, phrase_token_t, cur_step_pos);
415         *token = max_value->m_handles[1];
416 
417         phrase_token_t last_token = max_value->m_handles[0];
418         LookupStepIndex lookup_step_index = (LookupStepIndex) g_ptr_array_index(m_steps_index, cur_step_pos);
419 
420         gpointer key = NULL, value = NULL;
421         gboolean result = g_hash_table_lookup_extended
422             (lookup_step_index, GUINT_TO_POINTER(last_token), &key, &value);
423         if ( !result )
424             return false;
425 
426         LookupStepContent lookup_step_content = (LookupStepContent)
427             g_ptr_array_index(m_steps_content, cur_step_pos);
428         max_value = &g_array_index
429             (lookup_step_content, lookup_value_t, GPOINTER_TO_UINT(value));
430     }
431 
432     /* no need to reverse the result */
433     return true;
434 }
435