1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #ifndef MOZC_REWRITER_EMOJI_REWRITER_H_
31 #define MOZC_REWRITER_EMOJI_REWRITER_H_
32 
#include <cstddef>
#include <cstring>
#include <iterator>
#include <utility>

#include "base/serialized_string_array.h"
#include "base/string_piece.h"
#include "converter/segments.h"
#include "data_manager/data_manager_interface.h"
#include "rewriter/rewriter_interface.h"
42 
43 namespace mozc {
44 
45 class ConversionRequest;
46 
47 // EmojiRewriter class adds UTF-8 emoji characters in converted candidates of
48 // given segments, if each segment has a special key to convert.
49 // Added emoji characters are chosen by Yomi (reading of it) registered in
50 // a dictionary. If a segment has a key "えもじ", all emoji characters are
51 // pushed to its candidate list.
52 //
53 // Usage:
54 //
55 //   mozc::Segments segments;
56 //   mozc::Segment *segment = segments.add_segment();
57 //   mozc::Segment::Candidate *candidate = segment->add_candidate();
58 //   candidate->set_key("えもじ");
59 //
60 //   // Use one of data manager from data_manager/.
61 //   mozc::EmojiRewriter rewriter(data_manager);
//   rewriter.Rewrite(mozc::ConversionRequest(), &segments);
63 //
64 // Here, the first segment of segments is expected to have all emoji
65 // characters in its candidates' values.  You can see them as such:
66 //
67 //   for (size_t i = 0; i < segment->candidate_size(); ++i) {
68 //     LOG(INFO) << segment->candidate(i).value;
69 //   }
70 class EmojiRewriter : public RewriterInterface {
71  public:
72   static const size_t kEmojiDataByteLength = 28;
73 
74   // Emoji data token is 28 bytes data of the following format:
75   //
76   // +-------------------------------------+
77   // | Key index (4 byte)                  |
78   // +-------------------------------------+
79   // | UTF8 emoji index (4 byte)           |
80   // +-------------------------------------+
81   // | Android PUA code (4 byte)           |
82   // +-------------------------------------+
83   // | UTF8 description index (4 byte)     |
84   // +-------------------------------------+
85   // | Docomo description index (4 byte)   |
86   // +-------------------------------------+
87   // | Softbank description index (4 byte) |
88   // +-------------------------------------+
89   // | KDDI description index (4 byte)     |
90   // +-------------------------------------+
91   //
92   // Here, index is the position in the string array at which the corresponding
93   // string value is stored.  Tokens are sorted in order of key so that it can
94   // be search by binary search.
95   //
96   // The following iterator class can be used to iterate over token array.
97   class EmojiDataIterator
98       : public std::iterator<std::random_access_iterator_tag, uint32> {
99    public:
EmojiDataIterator()100     EmojiDataIterator() : ptr_(nullptr) {}
EmojiDataIterator(const char * ptr)101     explicit EmojiDataIterator(const char *ptr) : ptr_(ptr) {}
102 
key_index()103     uint32 key_index() const {
104       return *reinterpret_cast<const uint32 *>(ptr_);
105     }
emoji_index()106     uint32 emoji_index() const {
107       return *reinterpret_cast<const uint32 *>(ptr_ + 4);
108     }
android_pua()109     uint32 android_pua() const {
110       return *reinterpret_cast<const uint32 *>(ptr_ + 8);
111     }
description_utf8_index()112     uint32 description_utf8_index() const {
113       return *reinterpret_cast<const uint32 *>(ptr_ + 12);
114     }
description_docomo_index()115     uint32 description_docomo_index() const {
116       return *reinterpret_cast<const uint32 *>(ptr_ + 16);
117     }
description_softbank_index()118     uint32 description_softbank_index() const {
119       return *reinterpret_cast<const uint32 *>(ptr_ + 20);
120     }
description_kddi_index()121     uint32 description_kddi_index() const {
122       return *reinterpret_cast<const uint32 *>(ptr_ + 24);
123     }
124 
125     // Returns key index as token array is searched by key.
126     uint32 operator*() const { return key_index(); }
127 
swap(EmojiDataIterator & x)128     void swap(EmojiDataIterator &x) {
129       using std::swap;
130       swap(ptr_, x.ptr_);
131     }
swap(EmojiDataIterator & x,EmojiDataIterator & y)132     friend void swap(EmojiDataIterator &x, EmojiDataIterator &y) {
133       return x.swap(y);
134     }
135 
136     EmojiDataIterator &operator++() {
137       ptr_ += kEmojiDataByteLength;
138       return *this;
139     }
140 
141     EmojiDataIterator operator++(int) {
142       const char *tmp = ptr_;
143       ptr_ += kEmojiDataByteLength;
144       return EmojiDataIterator(tmp);
145     }
146 
147     EmojiDataIterator &operator--() {
148       ptr_ -= kEmojiDataByteLength;
149       return *this;
150     }
151 
152     EmojiDataIterator operator--(int) {
153       const char *tmp = ptr_;
154       ptr_ -= kEmojiDataByteLength;
155       return EmojiDataIterator(tmp);
156     }
157 
158     EmojiDataIterator &operator+=(ptrdiff_t n) {
159       ptr_ += n * kEmojiDataByteLength;
160       return *this;
161     }
162 
163     EmojiDataIterator &operator-=(ptrdiff_t n) {
164       ptr_ -= n * kEmojiDataByteLength;
165       return *this;
166     }
167 
168     friend EmojiDataIterator operator+(EmojiDataIterator x, ptrdiff_t n) {
169       return x += n;
170     }
171 
172     friend EmojiDataIterator operator+(ptrdiff_t n, EmojiDataIterator x) {
173       return x += n;
174     }
175 
176     friend EmojiDataIterator operator-(EmojiDataIterator x, ptrdiff_t n) {
177       return x -= n;
178     }
179 
180     friend ptrdiff_t operator-(EmojiDataIterator x, EmojiDataIterator y) {
181       return (x.ptr_ - y.ptr_) / kEmojiDataByteLength;
182     }
183 
184     friend bool operator==(EmojiDataIterator x, EmojiDataIterator y) {
185       return x.ptr_ == y.ptr_;
186     }
187 
188     friend bool operator!=(EmojiDataIterator x, EmojiDataIterator y) {
189       return x.ptr_ != y.ptr_;
190     }
191 
192     friend bool operator<(EmojiDataIterator x, EmojiDataIterator y) {
193       return x.ptr_ < y.ptr_;
194     }
195 
196     friend bool operator<=(EmojiDataIterator x, EmojiDataIterator y) {
197       return x.ptr_ <= y.ptr_;
198     }
199 
200     friend bool operator>(EmojiDataIterator x, EmojiDataIterator y) {
201       return x.ptr_ > y.ptr_;
202     }
203 
204     friend bool operator>=(EmojiDataIterator x, EmojiDataIterator y) {
205       return x.ptr_ >= y.ptr_;
206     }
207 
208    private:
209     const char *ptr_ = nullptr;
210   };
211 
212   using IteratorRange = std::pair<EmojiDataIterator, EmojiDataIterator>;
213 
214   // This class does not take an ownership of |emoji_data_list|, |token_list|
215   // and |value_list|.  If NULL pointer is passed to it, Mozc process
216   // terminates with an error.
217   explicit EmojiRewriter(const DataManagerInterface &data_manager);
218   ~EmojiRewriter() override;
219 
220   int capability(const ConversionRequest &request) const override;
221 
222   // Returns true if emoji candidates are added.  When user settings are set
223   // not to use EmojiRewriter, does nothing other than returning false.
224   // Otherwise, main process are done in ReriteCandidates().
225   // A reference to a ConversionRequest instance is not used, but it is required
226   // because of the interface.
227   bool Rewrite(const ConversionRequest &request,
228                Segments *segments) const override;
229 
230   // Counts the number of segments in which emoji candidates are selected,
231   // and stores the result as usage stats.
232   // NOTE: This method is expected to be called after the segments are processed
233   // with COMMIT command in a SessionConverter instance.  May record wrong
234   // stats if you call this method in other situation.
235   void Finish(const ConversionRequest &request, Segments *segments) override;
236 
237   // Returns true if the given candidate includes emoji characters.
238   // TODO(peria, hidehiko): Unify this checker and IsEmojiEntry defined in
239   //     predictor/user_history_predictor.cc.  If you make similar functions
240   //     before the merging in case, put a same note to avoid twisted
241   //     dependency.
242   static bool IsEmojiCandidate(const Segment::Candidate &candidate);
243 
244  private:
begin()245   EmojiDataIterator begin() const {
246     return EmojiDataIterator(token_array_data_.data());
247   }
end()248   EmojiDataIterator end() const {
249     return EmojiDataIterator(
250         token_array_data_.data() + token_array_data_.size());
251   }
252 
253   // Adds emoji candidates on each segment of given segments, if it has a
254   // specific string as a key based on a dictionary.  If a segment's value is
255   // "えもじ", adds all emoji candidates.
256   // Returns true if emoji candidates are added in any segment.
257   bool RewriteCandidates(
258       int32 available_emoji_carrier, Segments *segments) const;
259 
260   IteratorRange LookUpToken(StringPiece key) const;
261 
262   StringPiece token_array_data_;
263   SerializedStringArray string_array_;
264 
265   DISALLOW_COPY_AND_ASSIGN(EmojiRewriter);
266 };
267 
268 }  // namespace mozc
269 
270 #endif  // MOZC_REWRITER_EMOJI_REWRITER_H_
271