1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #ifndef MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_
31 #define MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_
32 
33 #include <functional>
34 #include <string>
35 #include <vector>
36 
37 #include "base/util.h"
38 #include "converter/connector.h"
39 #include "converter/converter_interface.h"
40 #include "converter/immutable_converter_interface.h"
41 #include "converter/segmenter.h"
42 #include "converter/segments.h"
43 #include "data_manager/data_manager_interface.h"
44 #include "dictionary/dictionary_interface.h"
45 #include "dictionary/dictionary_token.h"
46 #include "dictionary/pos_matcher.h"
47 #include "prediction/predictor_interface.h"
48 #include "prediction/suggestion_filter.h"
49 #include "prediction/zero_query_dict.h"
50 #include "request/conversion_request.h"
51 // for FRIEND_TEST()
52 #include "testing/base/public/gunit_prod.h"
53 
54 namespace mozc {
55 
56 // Dictionary-based predictor
57 class DictionaryPredictor : public PredictorInterface {
58  public:
59   // Initializes a predictor with given references to submodules. Note that
60   // pointers are not owned by the class and to be deleted by the caller.
61   DictionaryPredictor(const DataManagerInterface& data_manager,
62                       const ConverterInterface *converter,
63                       const ImmutableConverterInterface *immutable_converter,
64                       const dictionary::DictionaryInterface *dictionary,
65                       const dictionary::DictionaryInterface *suffix_dictionary,
66                       const Connector *connector,
67                       const Segmenter *segmenter,
68                       const dictionary::POSMatcher *pos_matcher,
69                       const SuggestionFilter *suggestion_filter);
70   ~DictionaryPredictor() override;
71 
72   bool PredictForRequest(const ConversionRequest &request,
73                          Segments *segments) const override;
74 
75   void Finish(const ConversionRequest &request, Segments *segments) override;
76 
GetPredictorName()77   const string &GetPredictorName() const override { return predictor_name_; }
78 
79  protected:
80   // Protected members for unittesting
81   // For use util method accessing private members, made them protected.
82   // https://github.com/google/googletest/blob/master/googletest/docs/FAQ.md
83   enum PredictionType {
84     // don't need to show any suggestions.
85     NO_PREDICTION = 0,
86     // suggests from current key user is now typing
87     UNIGRAM = 1,
88     // suggests from the previous history key user typed before.
89     BIGRAM = 2,
90     // suggests from immutable_converter
91     REALTIME = 4,
92     // add suffixes like "さん", "が" which matches to the pevious context.
93     SUFFIX =  8,
94     // add English words.
95     ENGLISH = 16,
96     // add prediciton to type corrected keys
97     TYPING_CORRECTION = 32,
98 
99     // Suggests from |converter_|. The difference from REALTIME is that it uses
100     // the full converter with rewriter, history, etc.
101     // TODO(noriyukit): This label should be integrated with REALTIME. This is
102     // why 65536 is used to indicate that it is a temporary assignment.
103     REALTIME_TOP = 65536,
104   };
105   // Bitfield to store a set of PredictionType.
106   typedef int32 PredictionTypes;
107 
108   struct Result {
ResultResult109     Result() : types(NO_PREDICTION), wcost(0), cost(0), lid(0), rid(0),
110                candidate_attributes(0), source_info(0),
111                consumed_key_size(0) {}
112 
113     void InitializeByTokenAndTypes(const dictionary::Token &token,
114                                    PredictionTypes types);
115     void SetTypesAndTokenAttributes(
116         PredictionTypes prediction_types,
117         dictionary::Token::AttributesBitfield token_attr);
118     void SetSourceInfoForZeroQuery(
119         ZeroQueryType zero_query_type);
120     bool IsUserDictionaryResult() const;
121 
122     string key;
123     string value;
124     // Indicating which PredictionType creates this instance.
125     // UNIGRAM, BIGRAM, REALTIME, SUFFIX, ENGLISH or TYPING_CORRECTION
126     // is set exclusively.
127     // TODO(matsuzakit): Using PredictionTypes both as input and output
128     //                   makes the code complex. Let's split them.
129     PredictionTypes types;
130     // Context "insensitive" candidate cost.
131     int wcost;
132     // Context "sensitive" candidate cost.
133     int cost;
134     int lid;
135     int rid;
136     // Boundary information for realtime conversion.
137     // This will be set only for realtime conversion result candidates.
138     // This contains inner segment size for key and value.
139     // If the candidate key and value are
140     // "わたしの|なまえは|なかのです", " 私の|名前は|中野です",
141     // |inner_segment_boundary| have [(4,2), (4, 3), (5, 4)].
142     std::vector<uint32> inner_segment_boundary;
143     uint32 candidate_attributes;
144     // Segment::Candidate::SourceInfo.
145     // Will be used for usage stats.
146     uint32 source_info;
147     size_t consumed_key_size;
148   };
149 
150   // On MSVS2008/2010, Constructors of TestableDictionaryPredictor::Result
151   // causes a compile error even if you change the access right of it to public.
152   // You can use TestableDictionaryPredictor::MakeEmptyResult() instead.
MakeEmptyResult()153   static Result MakeEmptyResult() {
154     return Result();
155   }
156 
157   class PredictiveLookupCallback;
158   class PredictiveBigramLookupCallback;
159   class ResultWCostLess;
160   class ResultCostLess;
161 
162   void AggregateRealtimeConversion(PredictionTypes types,
163                                    const ConversionRequest &request,
164                                    Segments *segments,
165                                    std::vector<Result> *results) const;
166 
167   void AggregateUnigramPrediction(PredictionTypes types,
168                                   const ConversionRequest &request,
169                                   const Segments &segments,
170                                   std::vector<Result> *results) const;
171 
172   void AggregateBigramPrediction(PredictionTypes types,
173                                  const ConversionRequest &request,
174                                  const Segments &segments,
175                                  std::vector<Result> *results) const;
176 
177   void AggregateSuffixPrediction(PredictionTypes types,
178                                  const ConversionRequest &request,
179                                  const Segments &segments,
180                                  std::vector<Result> *results) const;
181 
182   void AggregateEnglishPrediction(PredictionTypes types,
183                                   const ConversionRequest &request,
184                                   const Segments &segments,
185                                   std::vector<Result> *results) const;
186 
187   void AggregateTypeCorrectingPrediction(PredictionTypes types,
188                                          const ConversionRequest &request,
189                                          const Segments &segments,
190                                          std::vector<Result> *results) const;
191 
192   void ApplyPenaltyForKeyExpansion(const Segments &segments,
193                                    std::vector<Result> *results) const;
194 
195   bool AddPredictionToCandidates(const ConversionRequest &request,
196                                  Segments *segments,
197                                  std::vector<Result> *results) const;
198 
199  private:
200   FRIEND_TEST(DictionaryPredictorTest, GetPredictionTypes);
201   FRIEND_TEST(DictionaryPredictorTest,
202               GetPredictionTypesTestWithTypingCorrection);
203   FRIEND_TEST(DictionaryPredictorTest,
204               GetPredictionTypesTestWithZeroQuerySuggestion);
205   FRIEND_TEST(DictionaryPredictorTest, IsZipCodeRequest);
206   FRIEND_TEST(DictionaryPredictorTest, GetRealtimeCandidateMaxSize);
207   FRIEND_TEST(DictionaryPredictorTest, GetRealtimeCandidateMaxSizeForMixed);
208   FRIEND_TEST(DictionaryPredictorTest,
209               GetRealtimeCandidateMaxSizeWithActualConverter);
210   FRIEND_TEST(DictionaryPredictorTest, GetCandidateCutoffThreshold);
211   FRIEND_TEST(DictionaryPredictorTest, AggregateUnigramPrediction);
212   FRIEND_TEST(DictionaryPredictorTest, AggregateBigramPrediction);
213   FRIEND_TEST(DictionaryPredictorTest, AggregateZeroQueryBigramPrediction);
214   FRIEND_TEST(DictionaryPredictorTest, AggregateSuffixPrediction);
215   FRIEND_TEST(DictionaryPredictorTest, AggregateZeroQuerySuffixPrediction);
216   FRIEND_TEST(DictionaryPredictorTest,
217               AggregateUnigramCandidateForMixedConversion);
218   FRIEND_TEST(DictionaryPredictorTest, ZeroQuerySuggestionAfterNumbers);
219   FRIEND_TEST(DictionaryPredictorTest, TriggerNumberZeroQuerySuggestion);
220   FRIEND_TEST(DictionaryPredictorTest, TriggerZeroQuerySuggestion);
221   FRIEND_TEST(DictionaryPredictorTest, GetHistoryKeyAndValue);
222   FRIEND_TEST(DictionaryPredictorTest, RealtimeConversionStartingWithAlphabets);
223   FRIEND_TEST(DictionaryPredictorTest, IsAggressiveSuggestion);
224   FRIEND_TEST(DictionaryPredictorTest,
225               RealtimeConversionWithSpellingCorrection);
226   FRIEND_TEST(DictionaryPredictorTest, GetMissSpelledPosition);
227   FRIEND_TEST(DictionaryPredictorTest, RemoveMissSpelledCandidates);
228   FRIEND_TEST(DictionaryPredictorTest, ConformCharacterWidthToPreference);
229   FRIEND_TEST(DictionaryPredictorTest, SetLMCost);
230   FRIEND_TEST(DictionaryPredictorTest, SetLMCostForUserDictionaryWord);
231   FRIEND_TEST(DictionaryPredictorTest, SetDescription);
232   FRIEND_TEST(DictionaryPredictorTest, SetDebugDescription);
233   FRIEND_TEST(DictionaryPredictorTest, GetZeroQueryCandidates);
234 
235   typedef std::pair<string, ZeroQueryType> ZeroQueryResult;
236 
237   // Looks up the given range and appends zero query candidate list for |key|
238   // to |results|.
239   // Returns false if there is no result for |key|.
240   static bool GetZeroQueryCandidatesForKey(
241       const ConversionRequest &request,
242       const string &key,
243       const ZeroQueryDict &dict,
244       std::vector<ZeroQueryResult> *results);
245 
246   static void AppendZeroQueryToResults(
247       const std::vector<ZeroQueryResult> &candidates,
248       uint16 lid, uint16 rid, std::vector<Result> *results);
249 
250   // Returns false if no results were aggregated.
251   bool AggregatePrediction(const ConversionRequest &request,
252                            Segments *segments,
253                            std::vector<Result> *results) const;
254 
255   bool AggregateNumberZeroQueryPrediction(const ConversionRequest &request,
256                                           const Segments &segments,
257                                           std::vector<Result> *results) const;
258 
259   bool AggregateZeroQueryPrediction(const ConversionRequest &request,
260                                     const Segments &segments,
261                                     std::vector<Result> *result) const;
262 
263   void SetCost(const ConversionRequest &request,
264                const Segments &segments, std::vector<Result> *results) const;
265 
266   // Removes prediciton by setting NO_PREDICTION to result type if necessary.
267   void RemovePrediction(const ConversionRequest &request,
268                         const Segments &segments,
269                         std::vector<Result> *results) const;
270 
271   // Adds prediction results from history key and value.
272   void AddBigramResultsFromHistory(const string &history_key,
273                                    const string &history_value,
274                                    const ConversionRequest &request,
275                                    const Segments &segments,
276                                    std::vector<Result> *results) const;
277 
278   // Changes the prediction type for irrelevant bigram candidate.
279   void CheckBigramResult(const dictionary::Token &history_token,
280                          const Util::ScriptType history_ctype,
281                          const Util::ScriptType last_history_ctype,
282                          const ConversionRequest &request,
283                          Result *result) const;
284 
285   static void GetPredictiveResults(
286       const dictionary::DictionaryInterface &dictionary,
287       const string &history_key,
288       const ConversionRequest &request,
289       const Segments &segments,
290       PredictionTypes types,
291       size_t lookup_limit,
292       std::vector<Result> *results);
293 
294   void GetPredictiveResultsForBigram(
295       const dictionary::DictionaryInterface &dictionary,
296       const string &history_key,
297       const string &history_value,
298       const ConversionRequest &request,
299       const Segments &segments,
300       PredictionTypes types,
301       size_t lookup_limit,
302       std::vector<Result> *results) const;
303 
304   // Performs a custom look up for English words where case-conversion might be
305   // applied to lookup key and/or output results.
306   void GetPredictiveResultsForEnglish(
307       const dictionary::DictionaryInterface &dictionary,
308       const string &history_key,
309       const ConversionRequest &request,
310       const Segments &segments,
311       PredictionTypes types,
312       size_t lookup_limit,
313       std::vector<Result> *results) const;
314 
315   // Performs look-ups using type-corrected queries from composer. Usually
316   // involves multiple look-ups from dictionary.
317   void GetPredictiveResultsUsingTypingCorrection(
318       const dictionary::DictionaryInterface &dictionary,
319       const string &history_key,
320       const ConversionRequest &request,
321       const Segments &segments,
322       PredictionTypes types,
323       size_t lookup_limit,
324       std::vector<Result> *results) const;
325 
326   // Returns the position of misspelled character position.
327   //
328   // Example1:
329   // key: "れみおめろん"
330   // value: "レミオロメン"
331   // returns 3
332   //
333   // Example3:
334   // key: "ろっぽんぎ"5
335   // value: "六本木"
336   // returns 5 (charslen("六本木"))
337   size_t GetMissSpelledPosition(const string &key,
338                                 const string &value) const;
339 
340   // Returns language model cost of |token| given prediciton type |type|.
341   // |rid| is the right id of previous word (token).
342   // If |rid| is uknown, set 0 as a default value.
343   int GetLMCost(const Result &result, int rid) const;
344 
345   // Given the results aggregated by aggregates, remove
346   // miss-spelled results from the |results|.
347   // we don't directly remove miss-spelled result but set
348   // result[i].type = NO_PREDICTION.
349   //
350   // Here's the basic step of removal:
351   // Case1:
352   // result1: "あぼがど" => "アボガド"
353   // result2: "あぼがど" => "アボカド" (spelling correction)
354   // result3: "あぼかど" => "アボカド"
355   // In this case, we can remove result 1 and 2.
356   // If there exists the same result2.key in result1,3 and
357   // the same result2.value in result1,3, we can remove the
358   // 1) spelling correction candidate 2) candidate having
359   // the same key as the spelling correction candidate.
360   //
361   // Case2:
362   // result1: "あぼかど" => "アボカド"
363   // result2: "あぼがど" => "アボカド" (spelling correction)
364   // In this case, remove result2.
365   //
366   // Case3:
367   // result1: "あぼがど" => "アボガド"
368   // result2: "あぼがど" => "アボカド" (spelling correction)
369   // In this case,
370   //   a) user input: あ,あぼ,あぼ => remove result1, result2
371   //   b) user input: あぼが,あぼがど  => remove result1
372   //
373   // let |same_key_size| and |same_value_size| be the number of
374   // non-spelling-correction-candidates who have the same key/value as
375   // spelling-correction-candidate respectively.
376   //
377   // if (same_key_size > 0 && same_value_size > 0) {
378   //   remove spelling correction and candidates having the
379   //   same key as the spelling correction.
380   // } else if (same_key_size == 0 && same_value_size > 0) {
381   //   remove spelling correction
382   // } else {
383   //   do nothing.
384   // }
385   void RemoveMissSpelledCandidates(size_t request_key_len,
386                                    std::vector<Result> *results) const;
387 
388   // Scoring function which takes prediction bounus into account.
389   // It basically reranks the candidate by lang_prob * (1 + remain_len).
390   // This algorithm is mainly used for desktop.
391   void SetPredictionCost(const Segments &segments,
392                          std::vector<Result> *results) const;
393 
394   // Language model-based scoring function.
395   // This algorithm is mainly used for mobile.
396   void SetLMCost(const Segments &segments,
397                  std::vector<Result> *results) const;
398 
399   // Returns true if the suggestion is classified
400   // as "aggressive".
401   bool IsAggressiveSuggestion(
402       size_t query_len, size_t key_len, int cost,
403       bool is_suggestion, size_t total_candidates_size) const;
404 
405   // Gets history key/value.
406   // Returns false if history segments are
407   // not found.
408   bool GetHistoryKeyAndValue(const Segments &segments,
409                              string *key, string *value) const;
410 
411   // Returns a bitfield of PredictionType.
412   static PredictionTypes GetPredictionTypes(const ConversionRequest &request,
413                                             const Segments &segments);
414 
415   // Returns true if the realtime conversion should be used.
416   // TODO(hidehiko): add Config and Request instances into the arguments
417   //   to represent the dependency explicitly.
418   static bool ShouldRealTimeConversionEnabled(const ConversionRequest &request,
419                                               const Segments &segments);
420 
421   // Returns true if key consistes of '0'-'9' or '-'
422   static bool IsZipCodeRequest(const string &key);
423 
424   // Returns max size of realtime candidates.
425   size_t GetRealtimeCandidateMaxSize(const Segments &segments,
426                                      bool mixed_conversion,
427                                      size_t max_size) const;
428 
429   // Aggregates unigram candidate for non mixed conversion.
430   void AggregateUnigramCandidate(const ConversionRequest &request,
431                                  const Segments &segments,
432                                  std::vector<Result> *results) const;
433 
434   // Aggregates unigram candidate for mixed conversion.
435   // This reduces redundant candidates.
436   static void AggregateUnigramCandidateForMixedConversion(
437       const dictionary::DictionaryInterface &dictionary,
438       const ConversionRequest &request,
439       const Segments &segments,
440       std::vector<Result> *results);
441 
442   // The same as the static version of this method above but uses |dictionary_|.
443   void AggregateUnigramCandidateForMixedConversion(
444       const ConversionRequest &request,
445       const Segments &segments,
446       std::vector<Result> *results) const;
447 
448   // Returns cutoff threshold of unigram candidates.
449   // AggregateUnigramPrediction method does not return any candidates
450   // if there are too many (>= cutoff threshold) eligible candidates.
451   // This behavior prevents a user from seeing too many prefix-match
452   // candidates.
453   size_t GetCandidateCutoffThreshold(const Segments &segments) const;
454 
455   // Generates a top conversion result from |converter_| and adds its result to
456   // |results|.
457   bool PushBackTopConversionResult(const ConversionRequest &request,
458                                    const Segments &segments,
459                                    std::vector<Result> *results) const;
460 
461   void MaybeRecordUsageStats(const Segment::Candidate &candidate) const;
462 
463   // Sets candidate description.
464   static void SetDescription(PredictionTypes types,
465                              uint32 attributes,
466                              string *description);
467   // Description for DEBUG mode.
468   static void SetDebugDescription(PredictionTypes types,
469                                   string *description);
470 
471   const ConverterInterface *converter_;
472   const ImmutableConverterInterface *immutable_converter_;
473   const dictionary::DictionaryInterface *dictionary_;
474   const dictionary::DictionaryInterface *suffix_dictionary_;
475   const Connector *connector_;
476   const Segmenter *segmenter_;
477   const SuggestionFilter *suggestion_filter_;
478   const uint16 counter_suffix_word_id_;
479   const uint16 general_symbol_id_;
480   const string predictor_name_;
481   ZeroQueryDict zero_query_dict_;
482   ZeroQueryDict zero_query_number_dict_;
483 
484   DISALLOW_COPY_AND_ASSIGN(DictionaryPredictor);
485 };
486 
487 }  // namespace mozc
488 
489 #endif  // MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_
490