1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef PINYINIME_INCLUDE_NGRAM_H__ 18 #define PINYINIME_INCLUDE_NGRAM_H__ 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include "./dictdef.h" 23 #include <QFile> 24 25 namespace ime_pinyin { 26 27 typedef unsigned char CODEBOOK_TYPE; 28 29 static const size_t kCodeBookSize = 256; 30 31 class NGram { 32 public: 33 // The maximum score of a lemma item. 34 static const LmaScoreType kMaxScore = 0x3fff; 35 36 // In order to reduce the storage size, the original log value is amplified by 37 // kScoreAmplifier, and we use LmaScoreType to store. 38 // After this process, an item with a lower score has a higher frequency. 39 static const int kLogValueAmplifier = -800; 40 41 // System words' total frequency. It is not the real total frequency, instead, 42 // It is only used to adjust system lemmas' scores when the user dictionary's 43 // total frequency changes. 44 // In this version, frequencies of system lemmas are fixed. We are considering 45 // to make them changable in next version. 46 static const size_t kSysDictTotalFreq = 100000000; 47 48 private: 49 50 static NGram* instance_; 51 52 bool initialized_; 53 uint32 idx_num_; 54 55 size_t total_freq_none_sys_; 56 57 // Score compensation for system dictionary lemmas. 58 // Because after user adds some user lemmas, the total frequency changes, and 59 // we use this value to normalize the score. 60 float sys_score_compensation_; 61 62 #ifdef ___BUILD_MODEL___ 63 double *freq_codes_df_; 64 #endif 65 LmaScoreType *freq_codes_; 66 CODEBOOK_TYPE *lma_freq_idx_; 67 68 public: 69 NGram(); 70 ~NGram(); 71 72 static NGram& get_instance(); 73 74 bool save_ngram(FILE *fp); 75 bool load_ngram(QFile *fp); 76 77 // Set the total frequency of all none system dictionaries. 78 void set_total_freq_none_sys(size_t freq_none_sys); 79 80 float get_uni_psb(LemmaIdType lma_id); 81 82 // Convert a probability to score. Actually, the score will be limited to 83 // kMaxScore, but at runtime, we also need float expression to get accurate 84 // value of the score. 85 // After the conversion, a lower score indicates a higher probability of the 86 // item. 87 static float convert_psb_to_score(double psb); 88 89 #ifdef ___BUILD_MODEL___ 90 // For constructing the unigram mode model. 91 bool build_unigram(LemmaEntry *lemma_arr, size_t num, 92 LemmaIdType next_idx_unused); 93 #endif 94 }; 95 } 96 97 #endif // PINYINIME_INCLUDE_NGRAM_H__ 98