1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <assert.h>
18 #include <math.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include <time.h>
22 #include "../include/mystdlib.h"
23 #include "../include/ngram.h"
24 
25 namespace ime_pinyin {
26 
27 #define ADD_COUNT 0.3
28 
comp_double(const void * p1,const void * p2)29 int comp_double(const void *p1, const void *p2) {
30   if (*static_cast<const double*>(p1) < *static_cast<const double*>(p2))
31     return -1;
32   if (*static_cast<const double*>(p1) > *static_cast<const double*>(p2))
33     return 1;
34   return 0;
35 }
36 
distance(double freq,double code)37 inline double distance(double freq, double code) {
38   // return fabs(freq - code);
39   return freq * fabs(log(freq) - log(code));
40 }
41 
42 // Find the index of the code value which is nearest to the given freq
qsearch_nearest(double code_book[],double freq,int start,int end)43 int qsearch_nearest(double code_book[], double freq, int start, int end) {
44   if (start == end)
45     return start;
46 
47   if (start + 1 == end) {
48     if (distance(freq, code_book[end]) > distance(freq, code_book[start]))
49       return start;
50     return end;
51   }
52 
53   int mid = (start + end) / 2;
54 
55   if (code_book[mid] > freq)
56     return qsearch_nearest(code_book, freq, start, mid);
57   else
58     return qsearch_nearest(code_book, freq, mid, end);
59 }
60 
update_code_idx(double freqs[],size_t num,double code_book[],CODEBOOK_TYPE * code_idx)61 size_t update_code_idx(double freqs[], size_t num, double code_book[],
62                        CODEBOOK_TYPE *code_idx) {
63   size_t changed = 0;
64   for (size_t pos = 0; pos < num; pos++) {
65     CODEBOOK_TYPE idx;
66     idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1);
67     if (idx != code_idx[pos])
68       changed++;
69     code_idx[pos] = idx;
70   }
71   return changed;
72 }
73 
recalculate_kernel(double freqs[],size_t num,double code_book[],CODEBOOK_TYPE * code_idx)74 double recalculate_kernel(double freqs[], size_t num, double code_book[],
75                           CODEBOOK_TYPE *code_idx) {
76   double ret = 0;
77 
78   size_t *item_num =  new size_t[kCodeBookSize];
79   assert(item_num);
80   memset(item_num, 0, sizeof(size_t) * kCodeBookSize);
81 
82   double *cb_new = new double[kCodeBookSize];
83   assert(cb_new);
84   memset(cb_new, 0, sizeof(double) * kCodeBookSize);
85 
86   for (size_t pos = 0; pos < num; pos++) {
87     ret += distance(freqs[pos], code_book[code_idx[pos]]);
88 
89     cb_new[code_idx[pos]] += freqs[pos];
90     item_num[code_idx[pos]] += 1;
91   }
92 
93   for (size_t code = 0; code < kCodeBookSize; code++) {
94     assert(item_num[code] > 0);
95     code_book[code] = cb_new[code] / item_num[code];
96   }
97 
98   delete [] item_num;
99   delete [] cb_new;
100 
101   return ret;
102 }
103 
iterate_codes(double freqs[],size_t num,double code_book[],CODEBOOK_TYPE * code_idx)104 void iterate_codes(double freqs[], size_t num, double code_book[],
105                    CODEBOOK_TYPE *code_idx) {
106   size_t iter_num = 0;
107   double delta_last = 0;
108   do {
109     size_t changed = update_code_idx(freqs, num, code_book, code_idx);
110 
111     double delta = recalculate_kernel(freqs, num, code_book, code_idx);
112 
113     if (kPrintDebug0) {
114       printf("---Unigram codebook iteration: %d : %d, %.9f\n",
115              iter_num, changed, delta);
116     }
117     iter_num++;
118 
119     if (iter_num > 1 &&
120         (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001))
121       break;
122     delta_last = delta;
123   } while (true);
124 }
125 
126 
127 NGram* NGram::instance_ = NULL;
128 
NGram()129 NGram::NGram() {
130   initialized_ = false;
131   idx_num_ = 0;
132   lma_freq_idx_ = NULL;
133   sys_score_compensation_ = 0;
134 
135 #ifdef ___BUILD_MODEL___
136   freq_codes_df_ = NULL;
137 #endif
138   freq_codes_ = NULL;
139 }
140 
~NGram()141 NGram::~NGram() {
142   if (NULL != lma_freq_idx_)
143     free(lma_freq_idx_);
144 
145 #ifdef ___BUILD_MODEL___
146   if (NULL != freq_codes_df_)
147     free(freq_codes_df_);
148 #endif
149 
150   if (NULL != freq_codes_)
151     free(freq_codes_);
152 }
153 
get_instance()154 NGram& NGram::get_instance() {
155   if (NULL == instance_)
156     instance_ = new NGram();
157   return *instance_;
158 }
159 
save_ngram(FILE * fp)160 bool NGram::save_ngram(FILE *fp) {
161   if (!initialized_ || NULL == fp)
162     return false;
163 
164   if (0 == idx_num_ || NULL == freq_codes_ ||  NULL == lma_freq_idx_)
165     return false;
166 
167   if (fwrite(&idx_num_, sizeof(uint32), 1, fp) != 1)
168     return false;
169 
170   if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
171       kCodeBookSize)
172     return false;
173 
174   if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_)
175     return false;
176 
177   return true;
178 }
179 
load_ngram(QFile * fp)180 bool NGram::load_ngram(QFile *fp) {
181   if (NULL == fp)
182     return false;
183 
184   initialized_ = false;
185 
186   if (fp->read((char *)&idx_num_, sizeof(uint32)) != sizeof(uint32) )
187     return false;
188 
189   if (NULL != lma_freq_idx_)
190     free(lma_freq_idx_);
191 
192   if (NULL != freq_codes_)
193     free(freq_codes_);
194 
195   lma_freq_idx_ = static_cast<CODEBOOK_TYPE*>
196                   (malloc(idx_num_ * sizeof(CODEBOOK_TYPE)));
197   freq_codes_ = static_cast<LmaScoreType*>
198       (malloc(kCodeBookSize * sizeof(LmaScoreType)));
199 
200   if (NULL == lma_freq_idx_ || NULL == freq_codes_)
201     return false;
202 
203   if (fp->read((char *)freq_codes_, sizeof(LmaScoreType) * kCodeBookSize) !=
204       sizeof(LmaScoreType) * kCodeBookSize)
205     return false;
206 
207   if (fp->read((char *)lma_freq_idx_, sizeof(CODEBOOK_TYPE) * idx_num_) != sizeof(CODEBOOK_TYPE) * idx_num_)
208     return false;
209 
210   initialized_ = true;
211 
212   total_freq_none_sys_ = 0;
213   return true;
214 }
215 
set_total_freq_none_sys(size_t freq_none_sys)216 void NGram::set_total_freq_none_sys(size_t freq_none_sys) {
217   total_freq_none_sys_ = freq_none_sys;
218   if (0 == total_freq_none_sys_) {
219     sys_score_compensation_ = 0;
220   } else {
221     double factor = static_cast<double>(kSysDictTotalFreq) / (
222         kSysDictTotalFreq + total_freq_none_sys_);
223     sys_score_compensation_ = static_cast<float>(
224         log(factor) * kLogValueAmplifier);
225   }
226 }
227 
228 // The caller makes sure this oject is initialized.
get_uni_psb(LemmaIdType lma_id)229 float NGram::get_uni_psb(LemmaIdType lma_id) {
230   return  static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) +
231       sys_score_compensation_;
232 }
233 
convert_psb_to_score(double psb)234 float NGram::convert_psb_to_score(double psb) {
235   float score = static_cast<float>(
236       log(psb) * static_cast<double>(kLogValueAmplifier));
237   if (score > static_cast<float>(kMaxScore)) {
238     score = static_cast<float>(kMaxScore);
239   }
240   return score;
241 }
242 
243 #ifdef ___BUILD_MODEL___
build_unigram(LemmaEntry * lemma_arr,size_t lemma_num,LemmaIdType next_idx_unused)244 bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num,
245                           LemmaIdType next_idx_unused) {
246   if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1)
247     return false;
248 
249   double total_freq = 0;
250   double *freqs = new double[next_idx_unused];
251   if (NULL == freqs)
252     return false;
253 
254   freqs[0] = ADD_COUNT;
255   total_freq += freqs[0];
256   LemmaIdType idx_now = 0;
257   for (size_t pos = 0; pos < lemma_num; pos++) {
258     if (lemma_arr[pos].idx_by_hz == idx_now)
259       continue;
260     idx_now++;
261 
262     assert(lemma_arr[pos].idx_by_hz == idx_now);
263 
264     freqs[idx_now] = lemma_arr[pos].freq;
265     if (freqs[idx_now] <= 0)
266       freqs[idx_now] = 0.3;
267 
268     total_freq += freqs[idx_now];
269   }
270 
271   double max_freq = 0;
272   idx_num_ = idx_now + 1;
273   assert(idx_now + 1 == next_idx_unused);
274 
275   for (size_t pos = 0; pos < idx_num_; pos++) {
276     freqs[pos] = freqs[pos] / total_freq;
277     assert(freqs[pos] > 0);
278     if (freqs[pos] > max_freq)
279       max_freq = freqs[pos];
280   }
281 
282   // calculate the code book
283   if (NULL == freq_codes_df_)
284     freq_codes_df_ = new double[kCodeBookSize];
285   assert(freq_codes_df_);
286   memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize);
287 
288   if (NULL == freq_codes_)
289     freq_codes_ = new LmaScoreType[kCodeBookSize];
290   assert(freq_codes_);
291   memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize);
292 
293   size_t freq_pos = 0;
294   for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
295     bool found = true;
296 
297     while (found) {
298       found = false;
299       double cand = freqs[freq_pos];
300       for (size_t i = 0; i < code_pos; i++)
301         if (freq_codes_df_[i] == cand) {
302           found = true;
303           break;
304         }
305       if (found)
306         freq_pos++;
307     }
308 
309     freq_codes_df_[code_pos] = freqs[freq_pos];
310     freq_pos++;
311   }
312 
313   myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double);
314 
315   if (NULL == lma_freq_idx_)
316     lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_];
317   assert(lma_freq_idx_);
318 
319   iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_);
320 
321   delete [] freqs;
322 
323   if (kPrintDebug0) {
324     printf("\n------Language Model Unigram Codebook------\n");
325   }
326 
327   for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
328     double log_score = log(freq_codes_df_[code_pos]);
329     float final_score = convert_psb_to_score(freq_codes_df_[code_pos]);
330     if (kPrintDebug0) {
331       printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n",
332              code_pos, freq_codes_df_[code_pos], log_score, final_score);
333     }
334     freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score);
335   }
336 
337   initialized_ = true;
338   return true;
339 }
340 #endif
341 
342 }  // namespace ime_pinyin
343