1 /*
2  * SPDX-FileCopyrightText: 2017~2017 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  *
6  */
7 #include "jyutpingencoder.h"
8 #include "jyutpingdata.h"
9 #include <boost/algorithm/string.hpp>
10 #include <boost/bimap.hpp>
11 #include <queue>
12 
13 namespace libime {
14 namespace jyutping {
15 
16 static const std::string emptyString;
17 
18 template <typename L, typename R>
19 boost::bimap<L, R>
makeBimap(std::initializer_list<typename boost::bimap<L,R>::value_type> list)20 makeBimap(std::initializer_list<typename boost::bimap<L, R>::value_type> list) {
21     return boost::bimap<L, R>(list.begin(), list.end());
22 }
23 
24 static const auto initialMap = makeBimap<JyutpingInitial, std::string>({
25     {JyutpingInitial::B, "b"},   {JyutpingInitial::P, "p"},
26     {JyutpingInitial::M, "m"},   {JyutpingInitial::F, "f"},
27     {JyutpingInitial::D, "d"},   {JyutpingInitial::T, "t"},
28     {JyutpingInitial::N, "n"},   {JyutpingInitial::L, "l"},
29     {JyutpingInitial::G, "g"},   {JyutpingInitial::K, "k"},
30     {JyutpingInitial::NG, "ng"}, {JyutpingInitial::H, "h"},
31     {JyutpingInitial::GW, "gw"}, {JyutpingInitial::KW, "kw"},
32     {JyutpingInitial::W, "gw"},  {JyutpingInitial::W, "w"},
33     {JyutpingInitial::Z, "z"},   {JyutpingInitial::C, "c"},
34     {JyutpingInitial::S, "s"},   {JyutpingInitial::J, "j"},
35     {JyutpingInitial::Zero, ""},
36 });
37 
38 static const auto finalMap = makeBimap<JyutpingFinal, std::string>({
39     {JyutpingFinal::AA, "aa"},   {JyutpingFinal::AAI, "aai"},
40     {JyutpingFinal::AAU, "aau"}, {JyutpingFinal::AAM, "aam"},
41     {JyutpingFinal::AAN, "aan"}, {JyutpingFinal::AANG, "aang"},
42     {JyutpingFinal::AAP, "aap"}, {JyutpingFinal::AAT, "aat"},
43     {JyutpingFinal::AAK, "aak"}, {JyutpingFinal::AI, "ai"},
44     {JyutpingFinal::AU, "au"},   {JyutpingFinal::AM, "am"},
45     {JyutpingFinal::AN, "an"},   {JyutpingFinal::ANG, "ang"},
46     {JyutpingFinal::AP, "ap"},   {JyutpingFinal::AT, "at"},
47     {JyutpingFinal::AK, "ak"},   {JyutpingFinal::E, "e"},
48     {JyutpingFinal::EI, "ei"},   {JyutpingFinal::ET, "et"},
49     {JyutpingFinal::EU, "eu"},   {JyutpingFinal::EM, "em"},
50     {JyutpingFinal::EN, "en"},   {JyutpingFinal::ENG, "eng"},
51     {JyutpingFinal::EP, "ep"},   {JyutpingFinal::EK, "ek"},
52     {JyutpingFinal::I, "i"},     {JyutpingFinal::IU, "iu"},
53     {JyutpingFinal::IM, "im"},   {JyutpingFinal::IN, "in"},
54     {JyutpingFinal::ING, "ing"}, {JyutpingFinal::IP, "ip"},
55     {JyutpingFinal::IT, "it"},   {JyutpingFinal::IK, "ik"},
56     {JyutpingFinal::O, "o"},     {JyutpingFinal::OI, "oi"},
57     {JyutpingFinal::OU, "ou"},   {JyutpingFinal::OM, "om"},
58     {JyutpingFinal::ON, "on"},   {JyutpingFinal::ONG, "ong"},
59     {JyutpingFinal::OT, "ot"},   {JyutpingFinal::OK, "ok"},
60     {JyutpingFinal::OE, "oe"},   {JyutpingFinal::OENG, "oeng"},
61     {JyutpingFinal::OEK, "oek"}, {JyutpingFinal::EOI, "eoi"},
62     {JyutpingFinal::EON, "eon"}, {JyutpingFinal::EOT, "eot"},
63     {JyutpingFinal::U, "u"},     {JyutpingFinal::UI, "ui"},
64     {JyutpingFinal::UN, "un"},   {JyutpingFinal::UNG, "ung"},
65     {JyutpingFinal::UT, "ut"},   {JyutpingFinal::UK, "uk"},
66     {JyutpingFinal::YU, "yu"},   {JyutpingFinal::YUN, "yun"},
67     {JyutpingFinal::YUT, "yut"}, {JyutpingFinal::M, "m"},
68     {JyutpingFinal::NG, "ng"},   {JyutpingFinal::Zero, ""},
69 });
70 
71 static const int maxJyutpingLength = 6;
72 
73 template <typename Iter>
longestMatch(Iter iter,Iter end)74 std::pair<std::string_view, bool> longestMatch(Iter iter, Iter end) {
75     if (std::distance(iter, end) > maxJyutpingLength) {
76         end = iter + maxJyutpingLength;
77     }
78     auto range = std::string_view(&*iter, std::distance(iter, end));
79     auto &map = getJyutpingMap();
80     for (; range.size(); range.remove_suffix(1)) {
81         auto iterPair = map.equal_range(range);
82         if (iterPair.first != iterPair.second) {
83             // do not consider m/ng as complete jyutping
84             return std::make_pair(range, (range != "m" && range != "ng"));
85         }
86         if (range.size() <= 2) {
87             auto iter = initialMap.right.find(std::string{range});
88             if (iter != initialMap.right.end()) {
89                 return std::make_pair(range, false);
90             }
91         }
92     }
93 
94     if (!range.size()) {
95         range = std::string_view(&*iter, 1);
96     }
97 
98     return std::make_pair(range, false);
99 }
100 
toString() const101 std::string JyutpingSyllable::toString() const {
102     return JyutpingEncoder::initialToString(initial_) +
103            JyutpingEncoder::finalToString(final_);
104 }
105 
parseUserJyutping(std::string userJyutping,bool inner)106 SegmentGraph JyutpingEncoder::parseUserJyutping(std::string userJyutping,
107                                                 bool inner) {
108     SegmentGraph result(std::move(userJyutping));
109     const auto &jyutping = result.data();
110     auto end = jyutping.end();
111     std::priority_queue<size_t, std::vector<size_t>, std::greater<size_t>> q;
112     q.push(0);
113     while (q.size()) {
114         size_t top;
115         do {
116             top = q.top();
117             q.pop();
118         } while (q.size() && q.top() == top);
119         if (top >= jyutping.size()) {
120             continue;
121         }
122         auto iter = std::next(jyutping.begin(), top);
123         if (*iter == '\'') {
124             while (*iter == '\'' && iter != jyutping.end()) {
125                 iter++;
126             }
127             auto next = std::distance(jyutping.begin(), iter);
128             result.addNext(top, next);
129             if (static_cast<size_t>(next) < jyutping.size()) {
130                 q.push(next);
131             }
132             continue;
133         }
134         std::string_view str;
135         bool isCompleteJyutping;
136         std::tie(str, isCompleteJyutping) = longestMatch(iter, end);
137 
138         // it's not complete a jyutping, no need to try
139         if (!isCompleteJyutping) {
140             result.addNext(top, top + str.size());
141             q.push(top + str.size());
142         } else {
143             // check fuzzy seg
144             // jyutping may end with aegikmnoptu
145             // and may start with abcdefghjklmnopstuwz.
146             // the intersection is aegkmnoptu.
147             // also, make sure current jyutping does not end with a separator.
148             auto &map = getJyutpingMap();
149             std::array<size_t, 2> nextSize;
150             size_t nNextSize = 0;
151             if (str.size() > 1 && top + str.size() < jyutping.size() &&
152                 jyutping[top + str.size()] != '\'' &&
153                 (str.back() == 'a' || str.back() == 'e' || str.back() == 'g' ||
154                  str.back() == 'k' || str.back() == 'm' || str.back() == 'n' ||
155                  str.back() == 'o' || str.back() == 'p' || str.back() == 't' ||
156                  str.back() == 'u') &&
157                 map.find(str.substr(0, str.size() - 1)) != map.end()) {
158                 // str[0:-1] is also a full jyutping, check next jyutping
159                 auto nextMatch = longestMatch(iter + str.size(), end);
160                 auto nextMatchAlt = longestMatch(iter + str.size() - 1, end);
161                 auto matchSize = str.size() + nextMatch.first.size();
162                 auto matchSizeAlt = str.size() - 1 + nextMatchAlt.first.size();
163                 if (std::make_pair(matchSize, nextMatch.second) >=
164                     std::make_pair(matchSizeAlt, nextMatchAlt.second)) {
165                     result.addNext(top, top + str.size());
166                     q.push(top + str.size());
167                     nextSize[nNextSize++] = str.size();
168                 }
169                 if (std::make_pair(matchSize, nextMatch.second) <=
170                     std::make_pair(matchSizeAlt, nextMatchAlt.second)) {
171                     result.addNext(top, top + str.size() - 1);
172                     q.push(top + str.size() - 1);
173                     nextSize[nNextSize++] = str.size() - 1;
174                 }
175             } else {
176                 result.addNext(top, top + str.size());
177                 q.push(top + str.size());
178                 nextSize[nNextSize++] = str.size();
179             }
180 
181             for (size_t i = 0; i < nNextSize; i++) {
182                 if (nextSize[i] >= 4 && inner) {
183                     auto &innerSegments = getInnerSegment();
184                     auto iter = innerSegments.find(
185                         std::string{str.substr(0, nextSize[i])});
186                     if (iter != innerSegments.end()) {
187                         result.addNext(top, top + iter->second.first.size());
188                         result.addNext(top + iter->second.first.size(),
189                                        top + nextSize[i]);
190                     }
191                 }
192             }
193         }
194     }
195     return result;
196 }
197 
encodeOneUserJyutping(std::string jyutping)198 std::vector<char> JyutpingEncoder::encodeOneUserJyutping(std::string jyutping) {
199     if (jyutping.empty()) {
200         return {};
201     }
202     auto graph = parseUserJyutping(std::move(jyutping), false);
203     std::vector<char> result;
204     const SegmentGraphNode *node = &graph.start(), *prev = nullptr;
205     while (node->nextSize()) {
206         prev = node;
207         node = &node->nexts().front();
208         auto seg = graph.segment(*prev, *node);
209         if (seg.empty() || seg[0] == '\'') {
210             continue;
211         }
212         auto syls = stringToSyllables(seg);
213         if (!syls.size()) {
214             return {};
215         }
216         result.push_back(static_cast<char>(syls[0].first));
217         result.push_back(static_cast<char>(syls[0].second[0].first));
218     }
219     return result;
220 }
221 
isValidUserJyutping(const char * data,size_t size)222 bool JyutpingEncoder::isValidUserJyutping(const char *data, size_t size) {
223     if (size % 2 != 0) {
224         return false;
225     }
226 
227     for (size_t i = 0; i < size / 2; i++) {
228         if (!JyutpingEncoder::isValidInitial(data[i * 2])) {
229             return false;
230         }
231     }
232     return true;
233 }
234 
decodeFullJyutping(const char * data,size_t size)235 std::string JyutpingEncoder::decodeFullJyutping(const char *data, size_t size) {
236     if (size % 2 != 0) {
237         throw std::invalid_argument("invalid jyutping key");
238     }
239     std::string result;
240     for (size_t i = 0, e = size / 2; i < e; i++) {
241         if (i) {
242             result += '\'';
243         }
244         result += initialToString(static_cast<JyutpingInitial>(data[i * 2]));
245         result += finalToString(static_cast<JyutpingFinal>(data[i * 2 + 1]));
246     }
247     return result;
248 }
249 
initialToString(JyutpingInitial initial)250 const std::string &JyutpingEncoder::initialToString(JyutpingInitial initial) {
251     const static std::vector<std::string> s = []() {
252         std::vector<std::string> s;
253         s.resize(lastInitial - firstInitial + 1);
254         for (char c = firstInitial; c <= lastInitial; c++) {
255             auto iter = initialMap.left.find(static_cast<JyutpingInitial>(c));
256             s[c - firstInitial] = iter->second;
257         }
258         return s;
259     }();
260     auto c = static_cast<char>(initial);
261     if (c >= firstInitial && c <= lastInitial) {
262         return s[c - firstInitial];
263     }
264     return emptyString;
265 }
266 
stringToInitial(const std::string & str)267 JyutpingInitial JyutpingEncoder::stringToInitial(const std::string &str) {
268     auto iter = initialMap.right.find(str);
269     if (iter != initialMap.right.end()) {
270         return iter->second;
271     }
272     return JyutpingInitial::Invalid;
273 }
274 
finalToString(JyutpingFinal final)275 const std::string &JyutpingEncoder::finalToString(JyutpingFinal final) {
276     const static std::vector<std::string> s = []() {
277         std::vector<std::string> s;
278         s.resize(lastFinal - firstFinal + 1);
279         for (char c = firstFinal; c <= lastFinal; c++) {
280             auto iter = finalMap.left.find(static_cast<JyutpingFinal>(c));
281             s[c - firstFinal] = iter->second;
282         }
283         return s;
284     }();
285     auto c = static_cast<char>(final);
286     if (c >= firstFinal && c <= lastFinal) {
287         return s[c - firstFinal];
288     }
289     return emptyString;
290 }
291 
stringToFinal(const std::string & str)292 JyutpingFinal JyutpingEncoder::stringToFinal(const std::string &str) {
293     auto iter = finalMap.right.find(str);
294     if (iter != finalMap.right.end()) {
295         return iter->second;
296     }
297     return JyutpingFinal::Invalid;
298 }
299 
isValidInitialFinal(JyutpingInitial initial,JyutpingFinal final)300 bool JyutpingEncoder::isValidInitialFinal(JyutpingInitial initial,
301                                           JyutpingFinal final) {
302     if (initial != JyutpingInitial::Invalid &&
303         final != JyutpingFinal::Invalid) {
304         int16_t encode =
305             ((static_cast<int16_t>(initial) - JyutpingEncoder::firstInitial) *
306              (JyutpingEncoder::lastFinal - JyutpingEncoder::firstFinal + 1)) +
307             (static_cast<int16_t>(final) - JyutpingEncoder::firstFinal);
308         const auto &a = getEncodedInitialFinal();
309         return encode < static_cast<int>(a.size()) && a[encode];
310     }
311     return false;
312 }
313 
getFuzzy(std::vector<std::pair<JyutpingInitial,std::vector<std::pair<JyutpingFinal,bool>>>> & syls,JyutpingSyllable syl)314 static void getFuzzy(
315     std::vector<std::pair<JyutpingInitial,
316                           std::vector<std::pair<JyutpingFinal, bool>>>> &syls,
317     JyutpingSyllable syl) {
318     JyutpingInitial initials[2] = {syl.initial(), JyutpingInitial::Invalid};
319     JyutpingFinal finals[2] = {syl.final(), JyutpingFinal::Invalid};
320     int initialSize = 1;
321     int finalSize = 1;
322 
323     for (int i = 0; i < initialSize; i++) {
324         for (int j = 0; j < finalSize; j++) {
325             auto initial = initials[i];
326             auto final = finals[j];
327             if ((i == 0 && j == 0) || final == JyutpingFinal::Invalid ||
328                 JyutpingEncoder::isValidInitialFinal(initial, final)) {
329                 auto iter = std::find_if(
330                     syls.begin(), syls.end(),
331                     [initial](const auto &p) { return p.first == initial; });
332                 if (iter == syls.end()) {
333                     syls.emplace_back(std::piecewise_construct,
334                                       std::forward_as_tuple(initial),
335                                       std::forward_as_tuple());
336                     iter = std::prev(syls.end());
337                 }
338                 auto &finals = iter->second;
339                 if (std::find_if(finals.begin(), finals.end(),
340                                  [final](auto &p) {
341                                      return p.first == final;
342                                  }) == finals.end()) {
343                     finals.emplace_back(final, i > 0 || j > 0);
344                 }
345             }
346         }
347     }
348 }
349 
350 MatchedJyutpingSyllables
stringToSyllables(std::string_view jyutping)351 JyutpingEncoder::stringToSyllables(std::string_view jyutping) {
352     std::vector<
353         std::pair<JyutpingInitial, std::vector<std::pair<JyutpingFinal, bool>>>>
354         result;
355     auto &map = getJyutpingMap();
356     // we only want {M,N,R}/Invalid instead of {M,N,R}/Zero, so we could get
357     // match for everything.
358     if (jyutping != "m" && jyutping != "ng") {
359         auto iterPair = map.equal_range(jyutping);
360         for (auto &item :
361              boost::make_iterator_range(iterPair.first, iterPair.second)) {
362             getFuzzy(result, {item.initial(), item.final()});
363         }
364     }
365 
366     auto iter = initialMap.right.find(std::string{jyutping});
367     if (initialMap.right.end() != iter) {
368         getFuzzy(result, {iter->second, JyutpingFinal::Invalid});
369     }
370 
371     if (result.size() == 0) {
372         result.emplace_back(
373             std::piecewise_construct,
374             std::forward_as_tuple(JyutpingInitial::Invalid),
375             std::forward_as_tuple(
376                 1, std::make_pair(JyutpingFinal::Invalid, false)));
377     }
378 
379     return result;
380 }
381 
382 std::vector<char>
encodeFullJyutping(std::string_view jyutping)383 JyutpingEncoder::encodeFullJyutping(std::string_view jyutping) {
384     std::vector<std::string> jyutpings;
385     boost::split(jyutpings, jyutping, boost::is_any_of("'"));
386     std::vector<char> result;
387     result.resize(jyutpings.size() * 2);
388     int idx = 0;
389     for (const auto &singleJyutping : jyutpings) {
390         auto &map = getJyutpingMap();
391         auto iter = map.find(singleJyutping);
392         if (iter == map.end()) {
393             throw std::invalid_argument("invalid full jyutping: " +
394                                         std::string{jyutping});
395         }
396         result[idx++] = static_cast<char>(iter->initial());
397         result[idx++] = static_cast<char>(iter->final());
398     }
399 
400     return result;
401 }
402 
403 } // namespace jyutping
404 } // namespace libime
405