1 /*
2 * libpinyin
3 * Library to deal with pinyin.
4 *
5 * Copyright (C) 2006-2007 Peng Wu
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include <stdio.h>
22 #include <errno.h>
23 #include <glib.h>
24 #include <glib/gstdio.h>
25 #include "memory_chunk.h"
26 #include "novel_types.h"
27 #include "ngram.h"
28
29 using namespace pinyin;
30
31 struct SingleGramItem{
32 phrase_token_t m_token;
33 guint32 m_freq;
34 };
35
SingleGram()36 SingleGram::SingleGram(){
37 m_chunk.set_size(sizeof(guint32));
38 memset(m_chunk.begin(), 0, sizeof(guint32));
39 }
40
SingleGram(void * buffer,size_t length,bool copy)41 SingleGram::SingleGram(void * buffer, size_t length, bool copy){
42 if (copy)
43 m_chunk.set_content(0, buffer, length);
44 else
45 m_chunk.set_chunk(buffer, length, NULL);
46 }
47
get_total_freq(guint32 & total) const48 bool SingleGram::get_total_freq(guint32 & total) const{
49 char * buf_begin = (char *)m_chunk.begin();
50 total = *((guint32 *)buf_begin);
51 return true;
52 }
53
set_total_freq(guint32 total)54 bool SingleGram::set_total_freq(guint32 total){
55 char * buf_begin = (char *)m_chunk.begin();
56 *((guint32 *)buf_begin) = total;
57 return true;
58 }
59
get_length()60 guint32 SingleGram::get_length(){
61 /* get the number of items. */
62 const SingleGramItem * begin = (const SingleGramItem *)
63 ((const char *)(m_chunk.begin()) + sizeof(guint32));
64 const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
65
66 const guint32 length = end - begin;
67
68 if (0 == length) {
69 /* no items here, total freq should be zero. */
70 guint32 total_freq = 0;
71 assert(get_total_freq(total_freq));
72 assert(0 == total_freq);
73 }
74
75 return length;
76 }
77
mask_out(phrase_token_t mask,phrase_token_t value)78 guint32 SingleGram::mask_out(phrase_token_t mask, phrase_token_t value){
79 guint32 removed_items = 0;
80
81 guint32 total_freq = 0;
82 assert(get_total_freq(total_freq));
83
84 const SingleGramItem * begin = (const SingleGramItem *)
85 ((const char *)(m_chunk.begin()) + sizeof(guint32));
86 const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
87
88 for (const SingleGramItem * cur = begin; cur != end; ++cur) {
89 if ((cur->m_token & mask) != value)
90 continue;
91
92 total_freq -= cur->m_freq;
93 size_t offset = sizeof(guint32) +
94 sizeof(SingleGramItem) * (cur - begin);
95 m_chunk.remove_content(offset, sizeof(SingleGramItem));
96
97 /* update chunk end. */
98 end = (const SingleGramItem *) m_chunk.end();
99 ++removed_items;
100 --cur;
101 }
102
103 assert(set_total_freq(total_freq));
104 return removed_items;
105 }
106
prune()107 bool SingleGram::prune(){
108 assert(false);
109 #if 0
110 SingleGramItem * begin = (SingleGramItem *)
111 ((const char *)(m_chunk.begin()) + sizeof(guint32));
112 SingleGramItem * end = (SingleGramItem *)m_chunk.end();
113
114 size_t nitem = 0;
115 for ( SingleGramItem * cur = begin; cur != end; ++cur){
116 cur->m_freq--;
117 nitem++;
118 if ( cur->m_freq == 0 ){
119 size_t offset = sizeof(guint32) + (cur - begin)
120 * sizeof(SingleGramItem) ;
121 m_chunk.remove_content(offset, sizeof(SingleGramItem));
122 }
123 }
124 guint32 total_freq;
125 assert(get_total_freq(total_freq));
126 assert(set_total_freq(total_freq - nitem));
127 #endif
128 return true;
129 }
130
token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs)131 static bool token_less_than(const SingleGramItem & lhs,const SingleGramItem & rhs){
132 return lhs.m_token < rhs.m_token;
133 }
134
retrieve_all(BigramPhraseWithCountArray array) const135 bool SingleGram::retrieve_all(/* out */ BigramPhraseWithCountArray array)
136 const {
137 const SingleGramItem * begin = (const SingleGramItem *)
138 ((const char *)(m_chunk.begin()) + sizeof(guint32));
139 const SingleGramItem * end = (const SingleGramItem *) m_chunk.end();
140
141 guint32 total_freq;
142 BigramPhraseItemWithCount bigram_item_with_count;
143 assert(get_total_freq(total_freq));
144
145 for ( const SingleGramItem * cur_item = begin; cur_item != end; ++cur_item){
146 bigram_item_with_count.m_token = cur_item->m_token;
147 bigram_item_with_count.m_count = cur_item->m_freq;
148 bigram_item_with_count.m_freq = cur_item->m_freq / (gfloat)total_freq;
149 g_array_append_val(array, bigram_item_with_count);
150 }
151
152 return true;
153 }
154
search(PhraseIndexRange * range,BigramPhraseArray array) const155 bool SingleGram::search(/* in */ PhraseIndexRange * range,
156 /* out */ BigramPhraseArray array) const {
157 const SingleGramItem * begin = (const SingleGramItem *)
158 ((const char *)(m_chunk.begin()) + sizeof(guint32));
159 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
160
161 SingleGramItem compare_item;
162 compare_item.m_token = range->m_range_begin;
163 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
164
165 guint32 total_freq;
166 BigramPhraseItem bigram_item;
167 assert(get_total_freq(total_freq));
168
169 for ( ; cur_item != end; ++cur_item){
170 if ( cur_item->m_token >= range->m_range_end )
171 break;
172 bigram_item.m_token = cur_item->m_token;
173 bigram_item.m_freq = cur_item->m_freq / (gfloat)total_freq;
174 g_array_append_val(array, bigram_item);
175 }
176
177 return true;
178 }
179
insert_freq(phrase_token_t token,guint32 freq)180 bool SingleGram::insert_freq( /* in */ phrase_token_t token,
181 /* in */ guint32 freq){
182 SingleGramItem * begin = (SingleGramItem *)
183 ((const char *)(m_chunk.begin()) + sizeof(guint32));
184 SingleGramItem * end = (SingleGramItem *) m_chunk.end();
185 SingleGramItem compare_item;
186 compare_item.m_token = token;
187 SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
188
189 SingleGramItem insert_item;
190 insert_item.m_token = token;
191 insert_item.m_freq = freq;
192 for ( ; cur_item != end; ++cur_item ){
193 if ( cur_item->m_token > token ){
194 size_t offset = sizeof(guint32) +
195 sizeof(SingleGramItem) * (cur_item - begin);
196 m_chunk.insert_content(offset, &insert_item,
197 sizeof(SingleGramItem));
198 return true;
199 }
200 if ( cur_item->m_token == token ){
201 return false;
202 }
203 }
204 m_chunk.insert_content(m_chunk.size(), &insert_item,
205 sizeof(SingleGramItem));
206 return true;
207 }
208
remove_freq(phrase_token_t token,guint32 & freq)209 bool SingleGram::remove_freq( /* in */ phrase_token_t token,
210 /* out */ guint32 & freq){
211 freq = 0;
212 const SingleGramItem * begin = (const SingleGramItem *)
213 ((const char *)(m_chunk.begin()) + sizeof(guint32));
214 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
215 SingleGramItem compare_item;
216 compare_item.m_token = token;
217 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
218
219 for ( ; cur_item != end; ++cur_item ){
220 if ( cur_item->m_token > token )
221 return false;
222 if ( cur_item->m_token == token ){
223 freq = cur_item -> m_freq;
224 size_t offset = sizeof(guint32) +
225 sizeof(SingleGramItem) * (cur_item - begin);
226 m_chunk.remove_content(offset, sizeof(SingleGramItem));
227 return true;
228 }
229 }
230 return false;
231 }
232
get_freq(phrase_token_t token,guint32 & freq) const233 bool SingleGram::get_freq(/* in */ phrase_token_t token,
234 /* out */ guint32 & freq) const {
235 freq = 0;
236 const SingleGramItem * begin = (const SingleGramItem *)
237 ((const char *)(m_chunk.begin()) + sizeof(guint32));
238 const SingleGramItem * end = (const SingleGramItem *)m_chunk.end();
239 SingleGramItem compare_item;
240 compare_item.m_token = token;
241 const SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
242
243 for ( ; cur_item != end; ++cur_item){
244 if ( cur_item->m_token > token )
245 return false;
246 if ( cur_item->m_token == token ){
247 freq = cur_item -> m_freq;
248 return true;
249 }
250 }
251 return false;
252 }
253
set_freq(phrase_token_t token,guint32 freq)254 bool SingleGram::set_freq( /* in */ phrase_token_t token,
255 /* in */ guint32 freq){
256 SingleGramItem * begin = (SingleGramItem *)
257 ((const char *)(m_chunk.begin()) + sizeof(guint32));
258 SingleGramItem * end = (SingleGramItem *)m_chunk.end();
259 SingleGramItem compare_item;
260 compare_item.m_token = token;
261 SingleGramItem * cur_item = std_lite::lower_bound(begin, end, compare_item, token_less_than);
262
263 for ( ;cur_item != end; ++cur_item){
264 if ( cur_item->m_token > token ){
265 return false;
266 }
267 if ( cur_item->m_token == token ){
268 cur_item -> m_freq = freq;
269 return true;
270 }
271 }
272 return false;
273 }
274
275
276 namespace pinyin{
277
278 /* merge origin system info and delta user info */
merge_single_gram(SingleGram * merged,const SingleGram * system,const SingleGram * user)279 bool merge_single_gram(SingleGram * merged, const SingleGram * system,
280 const SingleGram * user){
281 if (NULL == system && NULL == user)
282 return false;
283
284 MemoryChunk & merged_chunk = merged->m_chunk;
285
286 if (NULL == system) {
287 merged_chunk.set_chunk(user->m_chunk.begin(),
288 user->m_chunk.size(), NULL);
289 return true;
290 }
291
292 if (NULL == user) {
293 merged_chunk.set_chunk(system->m_chunk.begin(),
294 system->m_chunk.size(), NULL);
295 return true;
296 }
297
298 /* clear merged. */
299 merged_chunk.set_size(sizeof(guint32));
300
301 /* merge the origin info and delta info */
302 guint32 system_total, user_total;
303 assert(system->get_total_freq(system_total));
304 assert(user->get_total_freq(user_total));
305 const guint32 merged_total = system_total + user_total;
306 merged_chunk.set_content(0, &merged_total, sizeof(guint32));
307
308 const SingleGramItem * cur_system = (const SingleGramItem *)
309 (((const char *)(system->m_chunk.begin())) + sizeof(guint32));
310 const SingleGramItem * system_end = (const SingleGramItem *)
311 system->m_chunk.end();
312
313 const SingleGramItem * cur_user = (const SingleGramItem *)
314 (((const char *)(user->m_chunk.begin())) + sizeof(guint32));
315 const SingleGramItem * user_end = (const SingleGramItem *)
316 user->m_chunk.end();
317
318 while (cur_system < system_end && cur_user < user_end) {
319
320 if (cur_system->m_token < cur_user->m_token) {
321 /* do append operation here */
322 merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
323 cur_system++;
324 } else if (cur_system->m_token > cur_user->m_token) {
325 /* do append operation here */
326 merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
327 cur_user++;
328 } else {
329 assert(cur_system->m_token == cur_user->m_token);
330
331 SingleGramItem merged_item;
332 merged_item.m_token = cur_system->m_token;
333 merged_item.m_freq = cur_system->m_freq + cur_user->m_freq;
334
335 merged_chunk.append_content(&merged_item, sizeof(SingleGramItem));
336 cur_system++; cur_user++;
337 }
338 }
339
340 /* add remained items. */
341 while (cur_system < system_end) {
342 merged_chunk.append_content(cur_system, sizeof(SingleGramItem));
343 cur_system++;
344 }
345
346 while (cur_user < user_end) {
347 merged_chunk.append_content(cur_user, sizeof(SingleGramItem));
348 cur_user++;
349 }
350
351 return true;
352 }
353
354 };
355