1 // Copyright 2010-2018, Google Inc. 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions are 6 // met: 7 // 8 // * Redistributions of source code must retain the above copyright 9 // notice, this list of conditions and the following disclaimer. 10 // * Redistributions in binary form must reproduce the above 11 // copyright notice, this list of conditions and the following disclaimer 12 // in the documentation and/or other materials provided with the 13 // distribution. 14 // * Neither the name of Google Inc. nor the names of its 15 // contributors may be used to endorse or promote products derived from 16 // this software without specific prior written permission. 17 // 18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 

#ifndef MOZC_REWRITER_EMOJI_REWRITER_H_
#define MOZC_REWRITER_EMOJI_REWRITER_H_

#include <cstddef>
#include <cstring>
#include <iterator>
#include <utility>

#include "base/serialized_string_array.h"
#include "base/string_piece.h"
#include "converter/segments.h"
#include "data_manager/data_manager_interface.h"
#include "rewriter/rewriter_interface.h"

namespace mozc {

class ConversionRequest;

// EmojiRewriter class adds UTF-8 emoji characters in converted candidates of
// given segments, if each segment has a special key to convert.
// Added emoji characters are chosen by Yomi (reading of it) registered in
// a dictionary.  If a segment has a key "えもじ", all emoji characters are
// pushed to its candidate list.
//
// Usage:
//
//   mozc::Segments segments;
//   mozc::Segment *segment = segments.add_segment();
//   mozc::Segment::Candidate *candidate = segment->add_candidate();
//   candidate->set_key("えもじ");
//
//   // Use one of data manager from data_manager/.
//   mozc::EmojiRewriter rewriter(data_manager);
//   rewriter.Rewrite(mozc::ConversionRequest(), &segments);
//
// Here, the first segment of segments is expected to have all emoji
// characters in its candidates' values.
You can see them as such: 66 // 67 // for (size_t i = 0; i < segment->candidate_size(); ++i) { 68 // LOG(INFO) << segment->candidate(i).value; 69 // } 70 class EmojiRewriter : public RewriterInterface { 71 public: 72 static const size_t kEmojiDataByteLength = 28; 73 74 // Emoji data token is 28 bytes data of the following format: 75 // 76 // +-------------------------------------+ 77 // | Key index (4 byte) | 78 // +-------------------------------------+ 79 // | UTF8 emoji index (4 byte) | 80 // +-------------------------------------+ 81 // | Android PUA code (4 byte) | 82 // +-------------------------------------+ 83 // | UTF8 description index (4 byte) | 84 // +-------------------------------------+ 85 // | Docomo description index (4 byte) | 86 // +-------------------------------------+ 87 // | Softbank description index (4 byte) | 88 // +-------------------------------------+ 89 // | KDDI description index (4 byte) | 90 // +-------------------------------------+ 91 // 92 // Here, index is the position in the string array at which the corresponding 93 // string value is stored. Tokens are sorted in order of key so that it can 94 // be search by binary search. 95 // 96 // The following iterator class can be used to iterate over token array. 
97 class EmojiDataIterator 98 : public std::iterator<std::random_access_iterator_tag, uint32> { 99 public: EmojiDataIterator()100 EmojiDataIterator() : ptr_(nullptr) {} EmojiDataIterator(const char * ptr)101 explicit EmojiDataIterator(const char *ptr) : ptr_(ptr) {} 102 key_index()103 uint32 key_index() const { 104 return *reinterpret_cast<const uint32 *>(ptr_); 105 } emoji_index()106 uint32 emoji_index() const { 107 return *reinterpret_cast<const uint32 *>(ptr_ + 4); 108 } android_pua()109 uint32 android_pua() const { 110 return *reinterpret_cast<const uint32 *>(ptr_ + 8); 111 } description_utf8_index()112 uint32 description_utf8_index() const { 113 return *reinterpret_cast<const uint32 *>(ptr_ + 12); 114 } description_docomo_index()115 uint32 description_docomo_index() const { 116 return *reinterpret_cast<const uint32 *>(ptr_ + 16); 117 } description_softbank_index()118 uint32 description_softbank_index() const { 119 return *reinterpret_cast<const uint32 *>(ptr_ + 20); 120 } description_kddi_index()121 uint32 description_kddi_index() const { 122 return *reinterpret_cast<const uint32 *>(ptr_ + 24); 123 } 124 125 // Returns key index as token array is searched by key. 
126 uint32 operator*() const { return key_index(); } 127 swap(EmojiDataIterator & x)128 void swap(EmojiDataIterator &x) { 129 using std::swap; 130 swap(ptr_, x.ptr_); 131 } swap(EmojiDataIterator & x,EmojiDataIterator & y)132 friend void swap(EmojiDataIterator &x, EmojiDataIterator &y) { 133 return x.swap(y); 134 } 135 136 EmojiDataIterator &operator++() { 137 ptr_ += kEmojiDataByteLength; 138 return *this; 139 } 140 141 EmojiDataIterator operator++(int) { 142 const char *tmp = ptr_; 143 ptr_ += kEmojiDataByteLength; 144 return EmojiDataIterator(tmp); 145 } 146 147 EmojiDataIterator &operator--() { 148 ptr_ -= kEmojiDataByteLength; 149 return *this; 150 } 151 152 EmojiDataIterator operator--(int) { 153 const char *tmp = ptr_; 154 ptr_ -= kEmojiDataByteLength; 155 return EmojiDataIterator(tmp); 156 } 157 158 EmojiDataIterator &operator+=(ptrdiff_t n) { 159 ptr_ += n * kEmojiDataByteLength; 160 return *this; 161 } 162 163 EmojiDataIterator &operator-=(ptrdiff_t n) { 164 ptr_ -= n * kEmojiDataByteLength; 165 return *this; 166 } 167 168 friend EmojiDataIterator operator+(EmojiDataIterator x, ptrdiff_t n) { 169 return x += n; 170 } 171 172 friend EmojiDataIterator operator+(ptrdiff_t n, EmojiDataIterator x) { 173 return x += n; 174 } 175 176 friend EmojiDataIterator operator-(EmojiDataIterator x, ptrdiff_t n) { 177 return x -= n; 178 } 179 180 friend ptrdiff_t operator-(EmojiDataIterator x, EmojiDataIterator y) { 181 return (x.ptr_ - y.ptr_) / kEmojiDataByteLength; 182 } 183 184 friend bool operator==(EmojiDataIterator x, EmojiDataIterator y) { 185 return x.ptr_ == y.ptr_; 186 } 187 188 friend bool operator!=(EmojiDataIterator x, EmojiDataIterator y) { 189 return x.ptr_ != y.ptr_; 190 } 191 192 friend bool operator<(EmojiDataIterator x, EmojiDataIterator y) { 193 return x.ptr_ < y.ptr_; 194 } 195 196 friend bool operator<=(EmojiDataIterator x, EmojiDataIterator y) { 197 return x.ptr_ <= y.ptr_; 198 } 199 200 friend bool operator>(EmojiDataIterator x, EmojiDataIterator 
y) { 201 return x.ptr_ > y.ptr_; 202 } 203 204 friend bool operator>=(EmojiDataIterator x, EmojiDataIterator y) { 205 return x.ptr_ >= y.ptr_; 206 } 207 208 private: 209 const char *ptr_ = nullptr; 210 }; 211 212 using IteratorRange = std::pair<EmojiDataIterator, EmojiDataIterator>; 213 214 // This class does not take an ownership of |emoji_data_list|, |token_list| 215 // and |value_list|. If NULL pointer is passed to it, Mozc process 216 // terminates with an error. 217 explicit EmojiRewriter(const DataManagerInterface &data_manager); 218 ~EmojiRewriter() override; 219 220 int capability(const ConversionRequest &request) const override; 221 222 // Returns true if emoji candidates are added. When user settings are set 223 // not to use EmojiRewriter, does nothing other than returning false. 224 // Otherwise, main process are done in ReriteCandidates(). 225 // A reference to a ConversionRequest instance is not used, but it is required 226 // because of the interface. 227 bool Rewrite(const ConversionRequest &request, 228 Segments *segments) const override; 229 230 // Counts the number of segments in which emoji candidates are selected, 231 // and stores the result as usage stats. 232 // NOTE: This method is expected to be called after the segments are processed 233 // with COMMIT command in a SessionConverter instance. May record wrong 234 // stats if you call this method in other situation. 235 void Finish(const ConversionRequest &request, Segments *segments) override; 236 237 // Returns true if the given candidate includes emoji characters. 238 // TODO(peria, hidehiko): Unify this checker and IsEmojiEntry defined in 239 // predictor/user_history_predictor.cc. If you make similar functions 240 // before the merging in case, put a same note to avoid twisted 241 // dependency. 
242 static bool IsEmojiCandidate(const Segment::Candidate &candidate); 243 244 private: begin()245 EmojiDataIterator begin() const { 246 return EmojiDataIterator(token_array_data_.data()); 247 } end()248 EmojiDataIterator end() const { 249 return EmojiDataIterator( 250 token_array_data_.data() + token_array_data_.size()); 251 } 252 253 // Adds emoji candidates on each segment of given segments, if it has a 254 // specific string as a key based on a dictionary. If a segment's value is 255 // "えもじ", adds all emoji candidates. 256 // Returns true if emoji candidates are added in any segment. 257 bool RewriteCandidates( 258 int32 available_emoji_carrier, Segments *segments) const; 259 260 IteratorRange LookUpToken(StringPiece key) const; 261 262 StringPiece token_array_data_; 263 SerializedStringArray string_array_; 264 265 DISALLOW_COPY_AND_ASSIGN(EmojiRewriter); 266 }; 267 268 } // namespace mozc 269 270 #endif // MOZC_REWRITER_EMOJI_REWRITER_H_ 271