1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef PINYINIME_INCLUDE_NGRAM_H__
18 #define PINYINIME_INCLUDE_NGRAM_H__
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include "./dictdef.h"
23 #include <QFile>
24 
25 namespace ime_pinyin {
26 
// Element type used for code book indices; NGram::lma_freq_idx_ is declared
// as a pointer to this type.
27 typedef unsigned char CODEBOOK_TYPE;
28 
// NOTE(review): presumably the number of entries in the score code book; 256
// matches the number of distinct values an 8-bit CODEBOOK_TYPE can hold --
// confirm against the implementation file.
29 static const size_t kCodeBookSize = 256;
30 
// Language-model class of the Pinyin IME. It declares saving and loading of
// the model (save_ngram / load_ngram), a per-lemma lookup (get_uni_psb) and a
// probability-to-score conversion (convert_psb_to_score).
// NOTE(review): the static instance_ pointer together with get_instance()
// suggests a singleton, but the constructor is public -- confirm in the
// implementation file.
31 class NGram {
32  public:
33   // The maximum score of a lemma item.
34   static const LmaScoreType kMaxScore = 0x3fff;
35 
36   // In order to reduce the storage size, the original log value is amplified by
37   // kLogValueAmplifier, and the result is stored as a LmaScoreType.
38   // After this process, an item with a lower score has a higher frequency.
39   static const int kLogValueAmplifier = -800;
40 
41   // System words' total frequency. It is not the real total frequency; instead,
42   // it is only used to adjust system lemmas' scores when the user dictionary's
43   // total frequency changes.
44   // In this version, frequencies of system lemmas are fixed. We are considering
45   // making them changeable in the next version.
46   static const size_t kSysDictTotalFreq = 100000000;
47 
48  private:
49 
     // NOTE(review): presumably the shared instance handed out by
     // get_instance() -- confirm in the implementation file.
50   static NGram* instance_;
51 
     // NOTE(review): presumably set once a model has been loaded or built --
     // confirm in the implementation file.
52   bool initialized_;
     // NOTE(review): presumably the number of entries in lma_freq_idx_ --
     // confirm in the implementation file.
53   uint32 idx_num_;
54 
     // NOTE(review): presumably the value supplied through
     // set_total_freq_none_sys() -- confirm in the implementation file.
55   size_t total_freq_none_sys_;
56 
57   // Score compensation for system dictionary lemmas.
58   // After the user adds some user lemmas, the total frequency changes, and
59   // this value is used to normalize the score.
60   float sys_score_compensation_;
61 
     // Only declared when ___BUILD_MODEL___ is defined.
     // NOTE(review): looks like a double-precision counterpart of freq_codes_
     // used while building the model -- confirm.
62 #ifdef ___BUILD_MODEL___
63   double *freq_codes_df_;
64 #endif
     // NOTE(review): presumably the code book of scores (kCodeBookSize
     // entries) -- confirm in the implementation file.
65   LmaScoreType *freq_codes_;
     // NOTE(review): presumably one code book index per lemma, selecting an
     // entry of freq_codes_ -- confirm in the implementation file.
66   CODEBOOK_TYPE *lma_freq_idx_;
67 
68  public:
     // Constructor and destructor; their definitions are not in this header.
69   NGram();
70   ~NGram();
71 
     // Returns a reference to an NGram object.
     // NOTE(review): presumably the shared instance behind instance_ -- confirm.
72   static NGram& get_instance();
73 
     // Save and load the n-gram data through the given file handle. Note the
     // asymmetry: save_ngram takes a C stdio FILE*, load_ngram takes a Qt
     // QFile*.
     // NOTE(review): presumably both return true on success -- confirm.
74   bool save_ngram(FILE *fp);
75   bool load_ngram(QFile *fp);
76 
77   // Set the total frequency of all non-system dictionaries.
78   void set_total_freq_none_sys(size_t freq_none_sys);
79 
     // NOTE(review): by its name, returns the unigram value for the lemma
     // lma_id; whether the float is a probability or a score is not visible
     // in this header -- confirm in the implementation file.
80   float get_uni_psb(LemmaIdType lma_id);
81 
82   // Convert a probability to a score. Actually, the score will be limited to
83   // kMaxScore, but at runtime, we also need a float expression to get an
84   // accurate value of the score.
85   // After the conversion, a lower score indicates a higher probability of the
86   // item.
87   static float convert_psb_to_score(double psb);
88 
     // Only declared when ___BUILD_MODEL___ is defined.
89 #ifdef ___BUILD_MODEL___
90   // For constructing the unigram model.
     // NOTE(review): presumably lemma_arr points to num LemmaEntry items;
     // the role of next_idx_unused is not visible in this header -- confirm.
91   bool build_unigram(LemmaEntry *lemma_arr, size_t num,
92                      LemmaIdType next_idx_unused);
93 #endif
94 };
95 }
96 
97 #endif  // PINYINIME_INCLUDE_NGRAM_H__
98