1 // Copyright 2010-2018, Google Inc. 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions are 6 // met: 7 // 8 // * Redistributions of source code must retain the above copyright 9 // notice, this list of conditions and the following disclaimer. 10 // * Redistributions in binary form must reproduce the above 11 // copyright notice, this list of conditions and the following disclaimer 12 // in the documentation and/or other materials provided with the 13 // distribution. 14 // * Neither the name of Google Inc. nor the names of its 15 // contributors may be used to endorse or promote products derived from 16 // this software without specific prior written permission. 17 // 18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 30 #ifndef MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_ 31 #define MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_ 32 33 #include <functional> 34 #include <string> 35 #include <vector> 36 37 #include "base/util.h" 38 #include "converter/connector.h" 39 #include "converter/converter_interface.h" 40 #include "converter/immutable_converter_interface.h" 41 #include "converter/segmenter.h" 42 #include "converter/segments.h" 43 #include "data_manager/data_manager_interface.h" 44 #include "dictionary/dictionary_interface.h" 45 #include "dictionary/dictionary_token.h" 46 #include "dictionary/pos_matcher.h" 47 #include "prediction/predictor_interface.h" 48 #include "prediction/suggestion_filter.h" 49 #include "prediction/zero_query_dict.h" 50 #include "request/conversion_request.h" 51 // for FRIEND_TEST() 52 #include "testing/base/public/gunit_prod.h" 53 54 namespace mozc { 55 56 // Dictionary-based predictor 57 class DictionaryPredictor : public PredictorInterface { 58 public: 59 // Initializes a predictor with given references to submodules. Note that 60 // pointers are not owned by the class and to be deleted by the caller. 61 DictionaryPredictor(const DataManagerInterface& data_manager, 62 const ConverterInterface *converter, 63 const ImmutableConverterInterface *immutable_converter, 64 const dictionary::DictionaryInterface *dictionary, 65 const dictionary::DictionaryInterface *suffix_dictionary, 66 const Connector *connector, 67 const Segmenter *segmenter, 68 const dictionary::POSMatcher *pos_matcher, 69 const SuggestionFilter *suggestion_filter); 70 ~DictionaryPredictor() override; 71 72 bool PredictForRequest(const ConversionRequest &request, 73 Segments *segments) const override; 74 75 void Finish(const ConversionRequest &request, Segments *segments) override; 76 GetPredictorName()77 const string &GetPredictorName() const override { return predictor_name_; } 78 79 protected: 80 // Protected members for unittesting 81 // For use util method accessing private members, made them protected. 82 // https://github.com/google/googletest/blob/master/googletest/docs/FAQ.md 83 enum PredictionType { 84 // don't need to show any suggestions. 85 NO_PREDICTION = 0, 86 // suggests from current key user is now typing 87 UNIGRAM = 1, 88 // suggests from the previous history key user typed before. 89 BIGRAM = 2, 90 // suggests from immutable_converter 91 REALTIME = 4, 92 // add suffixes like "さん", "が" which matches to the pevious context. 93 SUFFIX = 8, 94 // add English words. 95 ENGLISH = 16, 96 // add prediciton to type corrected keys 97 TYPING_CORRECTION = 32, 98 99 // Suggests from |converter_|. The difference from REALTIME is that it uses 100 // the full converter with rewriter, history, etc. 101 // TODO(noriyukit): This label should be integrated with REALTIME. This is 102 // why 65536 is used to indicate that it is a temporary assignment. 103 REALTIME_TOP = 65536, 104 }; 105 // Bitfield to store a set of PredictionType. 106 typedef int32 PredictionTypes; 107 108 struct Result { ResultResult109 Result() : types(NO_PREDICTION), wcost(0), cost(0), lid(0), rid(0), 110 candidate_attributes(0), source_info(0), 111 consumed_key_size(0) {} 112 113 void InitializeByTokenAndTypes(const dictionary::Token &token, 114 PredictionTypes types); 115 void SetTypesAndTokenAttributes( 116 PredictionTypes prediction_types, 117 dictionary::Token::AttributesBitfield token_attr); 118 void SetSourceInfoForZeroQuery( 119 ZeroQueryType zero_query_type); 120 bool IsUserDictionaryResult() const; 121 122 string key; 123 string value; 124 // Indicating which PredictionType creates this instance. 125 // UNIGRAM, BIGRAM, REALTIME, SUFFIX, ENGLISH or TYPING_CORRECTION 126 // is set exclusively. 127 // TODO(matsuzakit): Using PredictionTypes both as input and output 128 // makes the code complex. Let's split them. 129 PredictionTypes types; 130 // Context "insensitive" candidate cost. 131 int wcost; 132 // Context "sensitive" candidate cost. 133 int cost; 134 int lid; 135 int rid; 136 // Boundary information for realtime conversion. 137 // This will be set only for realtime conversion result candidates. 138 // This contains inner segment size for key and value. 139 // If the candidate key and value are 140 // "わたしの|なまえは|なかのです", " 私の|名前は|中野です", 141 // |inner_segment_boundary| have [(4,2), (4, 3), (5, 4)]. 142 std::vector<uint32> inner_segment_boundary; 143 uint32 candidate_attributes; 144 // Segment::Candidate::SourceInfo. 145 // Will be used for usage stats. 146 uint32 source_info; 147 size_t consumed_key_size; 148 }; 149 150 // On MSVS2008/2010, Constructors of TestableDictionaryPredictor::Result 151 // causes a compile error even if you change the access right of it to public. 152 // You can use TestableDictionaryPredictor::MakeEmptyResult() instead. MakeEmptyResult()153 static Result MakeEmptyResult() { 154 return Result(); 155 } 156 157 class PredictiveLookupCallback; 158 class PredictiveBigramLookupCallback; 159 class ResultWCostLess; 160 class ResultCostLess; 161 162 void AggregateRealtimeConversion(PredictionTypes types, 163 const ConversionRequest &request, 164 Segments *segments, 165 std::vector<Result> *results) const; 166 167 void AggregateUnigramPrediction(PredictionTypes types, 168 const ConversionRequest &request, 169 const Segments &segments, 170 std::vector<Result> *results) const; 171 172 void AggregateBigramPrediction(PredictionTypes types, 173 const ConversionRequest &request, 174 const Segments &segments, 175 std::vector<Result> *results) const; 176 177 void AggregateSuffixPrediction(PredictionTypes types, 178 const ConversionRequest &request, 179 const Segments &segments, 180 std::vector<Result> *results) const; 181 182 void AggregateEnglishPrediction(PredictionTypes types, 183 const ConversionRequest &request, 184 const Segments &segments, 185 std::vector<Result> *results) const; 186 187 void AggregateTypeCorrectingPrediction(PredictionTypes types, 188 const ConversionRequest &request, 189 const Segments &segments, 190 std::vector<Result> *results) const; 191 192 void ApplyPenaltyForKeyExpansion(const Segments &segments, 193 std::vector<Result> *results) const; 194 195 bool AddPredictionToCandidates(const ConversionRequest &request, 196 Segments *segments, 197 std::vector<Result> *results) const; 198 199 private: 200 FRIEND_TEST(DictionaryPredictorTest, GetPredictionTypes); 201 FRIEND_TEST(DictionaryPredictorTest, 202 GetPredictionTypesTestWithTypingCorrection); 203 FRIEND_TEST(DictionaryPredictorTest, 204 GetPredictionTypesTestWithZeroQuerySuggestion); 205 FRIEND_TEST(DictionaryPredictorTest, IsZipCodeRequest); 206 FRIEND_TEST(DictionaryPredictorTest, GetRealtimeCandidateMaxSize); 207 FRIEND_TEST(DictionaryPredictorTest, GetRealtimeCandidateMaxSizeForMixed); 208 FRIEND_TEST(DictionaryPredictorTest, 209 GetRealtimeCandidateMaxSizeWithActualConverter); 210 FRIEND_TEST(DictionaryPredictorTest, GetCandidateCutoffThreshold); 211 FRIEND_TEST(DictionaryPredictorTest, AggregateUnigramPrediction); 212 FRIEND_TEST(DictionaryPredictorTest, AggregateBigramPrediction); 213 FRIEND_TEST(DictionaryPredictorTest, AggregateZeroQueryBigramPrediction); 214 FRIEND_TEST(DictionaryPredictorTest, AggregateSuffixPrediction); 215 FRIEND_TEST(DictionaryPredictorTest, AggregateZeroQuerySuffixPrediction); 216 FRIEND_TEST(DictionaryPredictorTest, 217 AggregateUnigramCandidateForMixedConversion); 218 FRIEND_TEST(DictionaryPredictorTest, ZeroQuerySuggestionAfterNumbers); 219 FRIEND_TEST(DictionaryPredictorTest, TriggerNumberZeroQuerySuggestion); 220 FRIEND_TEST(DictionaryPredictorTest, TriggerZeroQuerySuggestion); 221 FRIEND_TEST(DictionaryPredictorTest, GetHistoryKeyAndValue); 222 FRIEND_TEST(DictionaryPredictorTest, RealtimeConversionStartingWithAlphabets); 223 FRIEND_TEST(DictionaryPredictorTest, IsAggressiveSuggestion); 224 FRIEND_TEST(DictionaryPredictorTest, 225 RealtimeConversionWithSpellingCorrection); 226 FRIEND_TEST(DictionaryPredictorTest, GetMissSpelledPosition); 227 FRIEND_TEST(DictionaryPredictorTest, RemoveMissSpelledCandidates); 228 FRIEND_TEST(DictionaryPredictorTest, ConformCharacterWidthToPreference); 229 FRIEND_TEST(DictionaryPredictorTest, SetLMCost); 230 FRIEND_TEST(DictionaryPredictorTest, SetLMCostForUserDictionaryWord); 231 FRIEND_TEST(DictionaryPredictorTest, SetDescription); 232 FRIEND_TEST(DictionaryPredictorTest, SetDebugDescription); 233 FRIEND_TEST(DictionaryPredictorTest, GetZeroQueryCandidates); 234 235 typedef std::pair<string, ZeroQueryType> ZeroQueryResult; 236 237 // Looks up the given range and appends zero query candidate list for |key| 238 // to |results|. 239 // Returns false if there is no result for |key|. 240 static bool GetZeroQueryCandidatesForKey( 241 const ConversionRequest &request, 242 const string &key, 243 const ZeroQueryDict &dict, 244 std::vector<ZeroQueryResult> *results); 245 246 static void AppendZeroQueryToResults( 247 const std::vector<ZeroQueryResult> &candidates, 248 uint16 lid, uint16 rid, std::vector<Result> *results); 249 250 // Returns false if no results were aggregated. 251 bool AggregatePrediction(const ConversionRequest &request, 252 Segments *segments, 253 std::vector<Result> *results) const; 254 255 bool AggregateNumberZeroQueryPrediction(const ConversionRequest &request, 256 const Segments &segments, 257 std::vector<Result> *results) const; 258 259 bool AggregateZeroQueryPrediction(const ConversionRequest &request, 260 const Segments &segments, 261 std::vector<Result> *result) const; 262 263 void SetCost(const ConversionRequest &request, 264 const Segments &segments, std::vector<Result> *results) const; 265 266 // Removes prediciton by setting NO_PREDICTION to result type if necessary. 267 void RemovePrediction(const ConversionRequest &request, 268 const Segments &segments, 269 std::vector<Result> *results) const; 270 271 // Adds prediction results from history key and value. 272 void AddBigramResultsFromHistory(const string &history_key, 273 const string &history_value, 274 const ConversionRequest &request, 275 const Segments &segments, 276 std::vector<Result> *results) const; 277 278 // Changes the prediction type for irrelevant bigram candidate. 279 void CheckBigramResult(const dictionary::Token &history_token, 280 const Util::ScriptType history_ctype, 281 const Util::ScriptType last_history_ctype, 282 const ConversionRequest &request, 283 Result *result) const; 284 285 static void GetPredictiveResults( 286 const dictionary::DictionaryInterface &dictionary, 287 const string &history_key, 288 const ConversionRequest &request, 289 const Segments &segments, 290 PredictionTypes types, 291 size_t lookup_limit, 292 std::vector<Result> *results); 293 294 void GetPredictiveResultsForBigram( 295 const dictionary::DictionaryInterface &dictionary, 296 const string &history_key, 297 const string &history_value, 298 const ConversionRequest &request, 299 const Segments &segments, 300 PredictionTypes types, 301 size_t lookup_limit, 302 std::vector<Result> *results) const; 303 304 // Performs a custom look up for English words where case-conversion might be 305 // applied to lookup key and/or output results. 306 void GetPredictiveResultsForEnglish( 307 const dictionary::DictionaryInterface &dictionary, 308 const string &history_key, 309 const ConversionRequest &request, 310 const Segments &segments, 311 PredictionTypes types, 312 size_t lookup_limit, 313 std::vector<Result> *results) const; 314 315 // Performs look-ups using type-corrected queries from composer. Usually 316 // involves multiple look-ups from dictionary. 317 void GetPredictiveResultsUsingTypingCorrection( 318 const dictionary::DictionaryInterface &dictionary, 319 const string &history_key, 320 const ConversionRequest &request, 321 const Segments &segments, 322 PredictionTypes types, 323 size_t lookup_limit, 324 std::vector<Result> *results) const; 325 326 // Returns the position of misspelled character position. 327 // 328 // Example1: 329 // key: "れみおめろん" 330 // value: "レミオロメン" 331 // returns 3 332 // 333 // Example3: 334 // key: "ろっぽんぎ"5 335 // value: "六本木" 336 // returns 5 (charslen("六本木")) 337 size_t GetMissSpelledPosition(const string &key, 338 const string &value) const; 339 340 // Returns language model cost of |token| given prediciton type |type|. 341 // |rid| is the right id of previous word (token). 342 // If |rid| is uknown, set 0 as a default value. 343 int GetLMCost(const Result &result, int rid) const; 344 345 // Given the results aggregated by aggregates, remove 346 // miss-spelled results from the |results|. 347 // we don't directly remove miss-spelled result but set 348 // result[i].type = NO_PREDICTION. 349 // 350 // Here's the basic step of removal: 351 // Case1: 352 // result1: "あぼがど" => "アボガド" 353 // result2: "あぼがど" => "アボカド" (spelling correction) 354 // result3: "あぼかど" => "アボカド" 355 // In this case, we can remove result 1 and 2. 356 // If there exists the same result2.key in result1,3 and 357 // the same result2.value in result1,3, we can remove the 358 // 1) spelling correction candidate 2) candidate having 359 // the same key as the spelling correction candidate. 360 // 361 // Case2: 362 // result1: "あぼかど" => "アボカド" 363 // result2: "あぼがど" => "アボカド" (spelling correction) 364 // In this case, remove result2. 365 // 366 // Case3: 367 // result1: "あぼがど" => "アボガド" 368 // result2: "あぼがど" => "アボカド" (spelling correction) 369 // In this case, 370 // a) user input: あ,あぼ,あぼ => remove result1, result2 371 // b) user input: あぼが,あぼがど => remove result1 372 // 373 // let |same_key_size| and |same_value_size| be the number of 374 // non-spelling-correction-candidates who have the same key/value as 375 // spelling-correction-candidate respectively. 376 // 377 // if (same_key_size > 0 && same_value_size > 0) { 378 // remove spelling correction and candidates having the 379 // same key as the spelling correction. 380 // } else if (same_key_size == 0 && same_value_size > 0) { 381 // remove spelling correction 382 // } else { 383 // do nothing. 384 // } 385 void RemoveMissSpelledCandidates(size_t request_key_len, 386 std::vector<Result> *results) const; 387 388 // Scoring function which takes prediction bounus into account. 389 // It basically reranks the candidate by lang_prob * (1 + remain_len). 390 // This algorithm is mainly used for desktop. 391 void SetPredictionCost(const Segments &segments, 392 std::vector<Result> *results) const; 393 394 // Language model-based scoring function. 395 // This algorithm is mainly used for mobile. 396 void SetLMCost(const Segments &segments, 397 std::vector<Result> *results) const; 398 399 // Returns true if the suggestion is classified 400 // as "aggressive". 401 bool IsAggressiveSuggestion( 402 size_t query_len, size_t key_len, int cost, 403 bool is_suggestion, size_t total_candidates_size) const; 404 405 // Gets history key/value. 406 // Returns false if history segments are 407 // not found. 408 bool GetHistoryKeyAndValue(const Segments &segments, 409 string *key, string *value) const; 410 411 // Returns a bitfield of PredictionType. 412 static PredictionTypes GetPredictionTypes(const ConversionRequest &request, 413 const Segments &segments); 414 415 // Returns true if the realtime conversion should be used. 416 // TODO(hidehiko): add Config and Request instances into the arguments 417 // to represent the dependency explicitly. 418 static bool ShouldRealTimeConversionEnabled(const ConversionRequest &request, 419 const Segments &segments); 420 421 // Returns true if key consistes of '0'-'9' or '-' 422 static bool IsZipCodeRequest(const string &key); 423 424 // Returns max size of realtime candidates. 425 size_t GetRealtimeCandidateMaxSize(const Segments &segments, 426 bool mixed_conversion, 427 size_t max_size) const; 428 429 // Aggregates unigram candidate for non mixed conversion. 430 void AggregateUnigramCandidate(const ConversionRequest &request, 431 const Segments &segments, 432 std::vector<Result> *results) const; 433 434 // Aggregates unigram candidate for mixed conversion. 435 // This reduces redundant candidates. 436 static void AggregateUnigramCandidateForMixedConversion( 437 const dictionary::DictionaryInterface &dictionary, 438 const ConversionRequest &request, 439 const Segments &segments, 440 std::vector<Result> *results); 441 442 // The same as the static version of this method above but uses |dictionary_|. 443 void AggregateUnigramCandidateForMixedConversion( 444 const ConversionRequest &request, 445 const Segments &segments, 446 std::vector<Result> *results) const; 447 448 // Returns cutoff threshold of unigram candidates. 449 // AggregateUnigramPrediction method does not return any candidates 450 // if there are too many (>= cutoff threshold) eligible candidates. 451 // This behavior prevents a user from seeing too many prefix-match 452 // candidates. 453 size_t GetCandidateCutoffThreshold(const Segments &segments) const; 454 455 // Generates a top conversion result from |converter_| and adds its result to 456 // |results|. 457 bool PushBackTopConversionResult(const ConversionRequest &request, 458 const Segments &segments, 459 std::vector<Result> *results) const; 460 461 void MaybeRecordUsageStats(const Segment::Candidate &candidate) const; 462 463 // Sets candidate description. 464 static void SetDescription(PredictionTypes types, 465 uint32 attributes, 466 string *description); 467 // Description for DEBUG mode. 468 static void SetDebugDescription(PredictionTypes types, 469 string *description); 470 471 const ConverterInterface *converter_; 472 const ImmutableConverterInterface *immutable_converter_; 473 const dictionary::DictionaryInterface *dictionary_; 474 const dictionary::DictionaryInterface *suffix_dictionary_; 475 const Connector *connector_; 476 const Segmenter *segmenter_; 477 const SuggestionFilter *suggestion_filter_; 478 const uint16 counter_suffix_word_id_; 479 const uint16 general_symbol_id_; 480 const string predictor_name_; 481 ZeroQueryDict zero_query_dict_; 482 ZeroQueryDict zero_query_number_dict_; 483 484 DISALLOW_COPY_AND_ASSIGN(DictionaryPredictor); 485 }; 486 487 } // namespace mozc 488 489 #endif // MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_ 490