1 /*
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *
5  *  Copyright (C) 2006-2007 Peng Wu
6  *
7  *  This program is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include <stdio.h>
22 #include <errno.h>
23 #include <glib.h>
24 #include <glib/gstdio.h>
25 #include "memory_chunk.h"
26 #include "novel_types.h"
27 #include "ngram.h"
28 
29 using namespace pinyin;
30 
31 struct SingleGramItem{
32     phrase_token_t m_token;
33     guint32 m_freq;
34 };
35 
SingleGram()36 SingleGram::SingleGram(){
37     m_chunk.set_size(sizeof(guint32));
38     memset(m_chunk.begin(), 0, sizeof(guint32));
39 }
40 
SingleGram(void * buffer,size_t length,bool copy)41 SingleGram::SingleGram(void * buffer, size_t length, bool copy){
42     if (copy)
43         m_chunk.set_content(0, buffer, length);
44     else
45         m_chunk.set_chunk(buffer, length, NULL);
46 }
47 
get_total_freq(guint32 & total) const48 bool SingleGram::get_total_freq(guint32 & total) const{
49     char * buf_begin = (char *)m_chunk.begin();
50     total = *((guint32 *)buf_begin);
51     return true;
52 }
53 
set_total_freq(guint32 total)54 bool SingleGram::set_total_freq(guint32 total){
55     char * buf_begin = (char *)m_chunk.begin();
56     *((guint32 *)buf_begin) = total;
57     return true;
58 }
59 
get_length()60 guint32 SingleGram::get_length(){
61     /* get the number of items. */
62     const SingleGramItem * begin = (const SingleGramItem *)
63         ((const char *)(m_chunk.begin()) + sizeof(guint32));
64     const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
65 
66     const guint32 length = end - begin;
67 
68     if (0 == length) {
69         /* no items here, total freq should be zero. */
70         guint32 total_freq = 0;
71         assert(get_total_freq(total_freq));
72         assert(0 == total_freq);
73     }
74 
75     return length;
76 }
77 
mask_out(phrase_token_t mask,phrase_token_t value)78 guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){
79     guint32 removed_items = 0;
80 
81     guint32 total_freq = 0;
82     assert(get_total_freq(total_freq));
83 
84     const SingleGramItem * begin = (const SingleGramItem *)
85         ((const char *)(m_chunk.begin()) + sizeof(guint32));
86     const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
87 
88     for (const SingleGramItem * cur = begin; cur != end; ++cur) {
89         if ((cur->m_token & mask) != value)
90             continue;
91 
92         total_freq -= cur->m_freq;
93         size_t offset = sizeof(guint32) +
94             sizeof(SingleGramItem) * (cur - begin);
95         m_chunk.remove_content(offset, sizeof(SingleGramItem));
96 
97         /* update chunk end. */
98         end = (const SingleGramItem *) m_chunk.end();
99         ++removed_items;
100         --cur;
101     }
102 
103     assert(set_total_freq(total_freq));
104     return removed_items;
105 }
106 
prune()107 bool SingleGram::prune(){
108     assert(false);
109 #if 0
110     SingleGramItem * begin = (SingleGramItem *)
111 	((const char *)(m_chunk.begin()) + sizeof(guint32));
112     SingleGramItem * end = (SingleGramItem *)m_chunk.end();
113 
114     size_t nitem = 0;
115     for ( SingleGramItem * cur = begin; cur != end; ++cur){
116 	cur->m_freq--;
117 	nitem++;
118 	if ( cur->m_freq == 0 ){
119 	    size_t offset = sizeof(guint32) + (cur - begin)
120 		* sizeof(SingleGramItem) ;
121 	    m_chunk.remove_content(offset, sizeof(SingleGramItem));
122 	}
123     }
124     guint32 total_freq;
125     assert(get_total_freq(total_freq));
126     assert(set_total_freq(total_freq - nitem));
127 #endif
128 	return true;
129 }
130 
token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs)131 static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
132     return lhs.m_token < rhs.m_token;
133 }
134 
retrieve_all(BigramPhraseWithCountArray array) const135 bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
136     const {
137     const SingleGramItem * begin = (const SingleGramItem *)
138         ((const char *)(m_chunk.begin()) + sizeof(guint32));
139     const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
140 
141     guint32 total_freq;
142     BigramPhraseItemWithCount bigram_item_with_count;
143     assert(get_total_freq(total_freq));
144 
145     for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
146         bigram_item_with_count.m_token = cur_item->m_token;
147         bigram_item_with_count.m_count = cur_item->m_freq;
148         bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq;
149         g_array_append_val(array, bigram_item_with_count);
150     }
151 
152     return true;
153 }
154 
search(PhraseIndexRange * range,BigramPhraseArray array) const155 bool SingleGram::search(/* in */ PhraseIndexRange * range,
156 			/* out */ BigramPhraseArray array) const {
157     const SingleGramItem * begin = (const SingleGramItem *)
158 	((const char *)(m_chunk.begin()) + sizeof(guint32));
159     const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
160 
161     SingleGramItem compare_item;
162     compare_item.m_token = range->m_range_begin;
163     const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
164 
165     guint32 total_freq;
166     BigramPhraseItem bigram_item;
167     assert(get_total_freq(total_freq));
168 
169     for ( ; cur_item != end; ++cur_item){
170 	if ( cur_item->m_token >= range->m_range_end )
171 	    break;
172 	bigram_item.m_token = cur_item->m_token;
173 	bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
174 	g_array_append_val(array, bigram_item);
175     }
176 
177     return true;
178 }
179 
insert_freq(phrase_token_t token,guint32 freq)180 bool SingleGram::insert_freq( /* in */ phrase_token_t token,
181                               /* in */ guint32 freq){
182     SingleGramItem * begin = (SingleGramItem *)
183         ((const char *)(m_chunk.begin()) + sizeof(guint32));
184     SingleGramItem * end = (SingleGramItem *) m_chunk.end();
185     SingleGramItem compare_item;
186     compare_item.m_token = token;
187     SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
188 
189     SingleGramItem insert_item;
190     insert_item.m_token = token;
191     insert_item.m_freq = freq;
192     for ( ; cur_item != end; ++cur_item ){
193         if ( cur_item->m_token > token ){
194             size_t offset = sizeof(guint32) +
195                 sizeof(SingleGramItem) * (cur_item - begin);
196             m_chunk.insert_content(offset, &insert_item,
197                                    sizeof(SingleGramItem));
198             return true;
199         }
200         if ( cur_item->m_token == token ){
201             return false;
202         }
203     }
204     m_chunk.insert_content(m_chunk.size(), &insert_item,
205                            sizeof(SingleGramItem));
206     return true;
207 }
208 
remove_freq(phrase_token_t token,guint32 & freq)209 bool SingleGram::remove_freq( /* in */ phrase_token_t token,
210                               /* out */ guint32 & freq){
211     freq = 0;
212     const SingleGramItem * begin = (const SingleGramItem *)
213         ((const char *)(m_chunk.begin()) + sizeof(guint32));
214     const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
215     SingleGramItem compare_item;
216     compare_item.m_token = token;
217     const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
218 
219     for ( ; cur_item != end; ++cur_item ){
220         if ( cur_item->m_token > token )
221             return false;
222         if ( cur_item->m_token == token ){
223             freq = cur_item -> m_freq;
224             size_t offset = sizeof(guint32) +
225                 sizeof(SingleGramItem) * (cur_item - begin);
226             m_chunk.remove_content(offset, sizeof(SingleGramItem));
227             return true;
228         }
229     }
230     return false;
231 }
232 
get_freq(phrase_token_t token,guint32 & freq) const233 bool SingleGram::get_freq(/* in */ phrase_token_t token,
234                           /* out */ guint32 & freq) const {
235     freq = 0;
236     const SingleGramItem * begin = (const SingleGramItem *)
237 	((const char *)(m_chunk.begin()) + sizeof(guint32));
238     const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
239     SingleGramItem compare_item;
240     compare_item.m_token = token;
241     const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
242 
243     for ( ; cur_item != end; ++cur_item){
244 	if ( cur_item->m_token > token )
245 	    return false;
246 	if ( cur_item->m_token == token ){
247 	    freq = cur_item -> m_freq;
248 	    return true;
249 	}
250     }
251     return false;
252 }
253 
set_freq(phrase_token_t token,guint32 freq)254 bool SingleGram::set_freq( /* in */ phrase_token_t token,
255 			   /* in */ guint32 freq){
256     SingleGramItem * begin = (SingleGramItem *)
257 	((const char *)(m_chunk.begin()) + sizeof(guint32));
258     SingleGramItem * end = (SingleGramItem *)m_chunk.end();
259     SingleGramItem compare_item;
260     compare_item.m_token = token;
261     SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
262 
263     for ( ;cur_item != end; ++cur_item){
264 	if ( cur_item->m_token > token ){
265 	    return false;
266 	}
267 	if ( cur_item->m_token == token ){
268 	    cur_item -> m_freq = freq;
269 	    return true;
270 	}
271     }
272     return false;
273 }
274 
275 
276 namespace pinyin{
277 
278 /* merge origin system info and delta user info */
merge_single_gram(SingleGram * merged,const SingleGram * system,const SingleGram * user)279 bool merge_single_gram(SingleGram * merged, const SingleGram * system,
280                        const SingleGram * user){
281     if (NULL == system && NULL == user)
282         return false;
283 
284     MemoryChunk & merged_chunk = merged->m_chunk;
285 
286     if (NULL == system) {
287         merged_chunk.set_chunk(user->m_chunk.begin(),
288                                user->m_chunk.size(), NULL);
289         return true;
290     }
291 
292     if (NULL == user) {
293         merged_chunk.set_chunk(system->m_chunk.begin(),
294                                system->m_chunk.size(), NULL);
295         return true;
296     }
297 
298     /* clear merged. */
299     merged_chunk.set_size(sizeof(guint32));
300 
301     /* merge the origin info and delta info */
302     guint32 system_total, user_total;
303     assert(system->get_total_freq(system_total));
304     assert(user->get_total_freq(user_total));
305     const guint32 merged_total = system_total + user_total;
306     merged_chunk.set_content(0, &merged_total, sizeof(guint32));
307 
308     const SingleGramItem * cur_system = (const SingleGramItem *)
309         (((const char *)(system->m_chunk.begin())) + sizeof(guint32));
310     const SingleGramItem * system_end = (const SingleGramItem *)
311         system->m_chunk.end();
312 
313     const SingleGramItem * cur_user = (const SingleGramItem *)
314         (((const char *)(user->m_chunk.begin())) + sizeof(guint32));
315     const SingleGramItem * user_end = (const SingleGramItem *)
316         user->m_chunk.end();
317 
318     while (cur_system < system_end && cur_user < user_end) {
319 
320         if (cur_system->m_token < cur_user->m_token) {
321             /* do append operation here */
322             merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
323             cur_system++;
324         } else if (cur_system->m_token > cur_user->m_token) {
325             /* do append operation here */
326             merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
327             cur_user++;
328         } else {
329             assert(cur_system->m_token == cur_user->m_token);
330 
331             SingleGramItem merged_item;
332             merged_item.m_token = cur_system->m_token;
333             merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
334 
335             merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
336             cur_system++; cur_user++;
337         }
338     }
339 
340     /* add remained items. */
341     while (cur_system < system_end) {
342         merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
343         cur_system++;
344     }
345 
346     while (cur_user < user_end) {
347         merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
348         cur_user++;
349     }
350 
351     return true;
352 }
353 
354 };
355