1 /*
2  * SPDX-FileCopyrightText: 2017-2017 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  */
6 #include "pinyinencoder.h"
7 #include "pinyindata.h"
8 #include "shuangpinprofile.h"
9 #include <boost/algorithm/string.hpp>
10 #include <boost/bimap.hpp>
11 #include <boost/bimap/unordered_set_of.hpp>
12 #include <boost/container/static_vector.hpp>
13 #include <fcitx-utils/charutils.h>
14 #include <queue>
15 #include <sstream>
16 #include <string_view>
17 #include <tuple>
18 #include <unordered_map>
19 
20 namespace libime {
21 
// Shared empty string. PinyinEncoder::initialToString() and
// PinyinEncoder::finalToString() return a reference to it when the value they
// are given lies outside the known range.
static const std::string emptyString;
23 
operator <<(fcitx::LogMessageBuilder & log,PinyinFuzzyFlags fuzzy)24 fcitx::LogMessageBuilder &operator<<(fcitx::LogMessageBuilder &log,
25                                      PinyinFuzzyFlags fuzzy) {
26     log << fuzzy.toInteger();
27     return log;
28 }
29 
operator <<(fcitx::LogMessageBuilder & log,PinyinInitial initial)30 fcitx::LogMessageBuilder &operator<<(fcitx::LogMessageBuilder &log,
31                                      PinyinInitial initial) {
32     log << PinyinEncoder::initialToString(initial);
33     return log;
34 }
35 
operator <<(fcitx::LogMessageBuilder & log,PinyinFinal final)36 fcitx::LogMessageBuilder &operator<<(fcitx::LogMessageBuilder &log,
37                                      PinyinFinal final) {
38     log << PinyinEncoder::finalToString(final);
39     return log;
40 }
41 
operator <<(fcitx::LogMessageBuilder & log,PinyinSyllable syl)42 fcitx::LogMessageBuilder &operator<<(fcitx::LogMessageBuilder &log,
43                                      PinyinSyllable syl) {
44     log << syl.toString();
45     return log;
46 }
47 
48 template <typename L, typename R>
49 boost::bimap<L, R>
makeBimap(std::initializer_list<typename boost::bimap<L,R>::value_type> list)50 makeBimap(std::initializer_list<typename boost::bimap<L, R>::value_type> list) {
51     return boost::bimap<L, R>(list.begin(), list.end());
52 }
53 
// Bidirectional table between PinyinInitial values and their spellings.
// The left view maps enum -> string, the right view maps string -> enum.
// PinyinInitial::Zero (no initial) is spelled as the empty string.
static const auto initialMap = makeBimap<PinyinInitial, std::string>({
    {PinyinInitial::B, "b"},   {PinyinInitial::P, "p"},
    {PinyinInitial::M, "m"},   {PinyinInitial::F, "f"},
    {PinyinInitial::D, "d"},   {PinyinInitial::T, "t"},
    {PinyinInitial::N, "n"},   {PinyinInitial::L, "l"},
    {PinyinInitial::G, "g"},   {PinyinInitial::K, "k"},
    {PinyinInitial::H, "h"},   {PinyinInitial::J, "j"},
    {PinyinInitial::Q, "q"},   {PinyinInitial::X, "x"},
    {PinyinInitial::ZH, "zh"}, {PinyinInitial::CH, "ch"},
    {PinyinInitial::SH, "sh"}, {PinyinInitial::R, "r"},
    {PinyinInitial::Z, "z"},   {PinyinInitial::C, "c"},
    {PinyinInitial::S, "s"},   {PinyinInitial::Y, "y"},
    {PinyinInitial::W, "w"},   {PinyinInitial::Zero, ""},
});
68 
// Bidirectional table between PinyinFinal values and their spellings.
// The left view maps enum -> string, the right view maps string -> enum.
// PinyinFinal::Zero (no final) is spelled as the empty string.
static const auto finalMap = makeBimap<PinyinFinal, std::string>({
    {PinyinFinal::A, "a"},       {PinyinFinal::AI, "ai"},
    {PinyinFinal::AN, "an"},     {PinyinFinal::ANG, "ang"},
    {PinyinFinal::AO, "ao"},     {PinyinFinal::E, "e"},
    {PinyinFinal::EI, "ei"},     {PinyinFinal::EN, "en"},
    {PinyinFinal::ENG, "eng"},   {PinyinFinal::ER, "er"},
    {PinyinFinal::O, "o"},       {PinyinFinal::ONG, "ong"},
    {PinyinFinal::OU, "ou"},     {PinyinFinal::I, "i"},
    {PinyinFinal::IA, "ia"},     {PinyinFinal::IE, "ie"},
    {PinyinFinal::IAO, "iao"},   {PinyinFinal::IU, "iu"},
    {PinyinFinal::IAN, "ian"},   {PinyinFinal::IN, "in"},
    {PinyinFinal::IANG, "iang"}, {PinyinFinal::ING, "ing"},
    {PinyinFinal::IONG, "iong"}, {PinyinFinal::U, "u"},
    {PinyinFinal::UA, "ua"},     {PinyinFinal::UO, "uo"},
    {PinyinFinal::UAI, "uai"},   {PinyinFinal::UI, "ui"},
    {PinyinFinal::UAN, "uan"},   {PinyinFinal::UN, "un"},
    {PinyinFinal::UANG, "uang"}, {PinyinFinal::V, "v"},
    {PinyinFinal::UE, "ue"},     {PinyinFinal::VE, "ve"},
    {PinyinFinal::NG, "ng"},     {PinyinFinal::Zero, ""},
});
89 
// Maximum number of characters longestMatch() examines when looking for a
// single pinyin.
static const int maxPinyinLength = 6;
91 
92 template <typename Iter>
longestMatch(Iter iter,Iter end,PinyinFuzzyFlags flags)93 std::pair<std::string_view, bool> longestMatch(Iter iter, Iter end,
94                                                PinyinFuzzyFlags flags) {
95     if (std::distance(iter, end) > maxPinyinLength) {
96         end = iter + maxPinyinLength;
97     }
98     auto range = std::string_view(&*iter, std::distance(iter, end));
99     const auto &map = getPinyinMap();
100     for (; !range.empty(); range.remove_suffix(1)) {
101         auto iterPair = map.equal_range(range);
102         if (iterPair.first != iterPair.second) {
103             for (const auto &item :
104                  boost::make_iterator_range(iterPair.first, iterPair.second)) {
105                 if (flags.test(item.flags())) {
106                     // do not consider m/n/r as complete pinyin
107                     return std::make_pair(
108                         range, (range != "m" && range != "n" && range != "r"));
109                 }
110             }
111         }
112         if (range.size() <= 2) {
113             auto iter = initialMap.right.find(std::string{range});
114             if (iter != initialMap.right.end()) {
115                 return std::make_pair(range, false);
116             }
117         }
118     }
119 
120     if (range.empty()) {
121         range = std::string_view(&*iter, 1);
122     }
123 
124     return std::make_pair(range, false);
125 }
126 
toString() const127 std::string PinyinSyllable::toString() const {
128     return PinyinEncoder::initialToString(initial_) +
129            PinyinEncoder::finalToString(final_);
130 }
131 
parseUserPinyin(std::string userPinyin,PinyinFuzzyFlags flags)132 SegmentGraph PinyinEncoder::parseUserPinyin(std::string userPinyin,
133                                             PinyinFuzzyFlags flags) {
134     SegmentGraph result{std::move(userPinyin)};
135     auto pinyin = result.data();
136     std::transform(pinyin.begin(), pinyin.end(), pinyin.begin(),
137                    fcitx::charutils::tolower);
138     auto end = pinyin.end();
139     std::priority_queue<size_t, std::vector<size_t>, std::greater<size_t>> q;
140     q.push(0);
141     while (!q.empty()) {
142         size_t top;
143         do {
144             top = q.top();
145             q.pop();
146         } while (!q.empty() && q.top() == top);
147         if (top >= pinyin.size()) {
148             continue;
149         }
150         auto iter = std::next(pinyin.begin(), top);
151         if (*iter == '\'') {
152             while (*iter == '\'' && iter != pinyin.end()) {
153                 iter++;
154             }
155             auto next = std::distance(pinyin.begin(), iter);
156             result.addNext(top, next);
157             if (static_cast<size_t>(next) < pinyin.size()) {
158                 q.push(next);
159             }
160             continue;
161         }
162         std::string_view str;
163         bool isCompletePinyin;
164         std::tie(str, isCompletePinyin) = longestMatch(iter, end, flags);
165 
166         // it's not complete a pinyin, no need to try
167         if (!isCompletePinyin) {
168             result.addNext(top, top + str.size());
169             q.push(top + str.size());
170         } else {
171             // check fuzzy seg
172             // pinyin may end with aegimnoruv
173             // and may start with abcdefghjklmnopqrstwxyz.
174             // the intersection is aegmnor, while for m, it only 'm', so don't
175             // consider it
176             // also, make sure current pinyin does not end with a separator,
177             // other wise, jin'an may be parsed into ji'n because, nextMatch is
178             // starts with "'".
179             const auto &map = getPinyinMap();
180             std::array<size_t, 2> nextSize;
181             size_t nNextSize = 0;
182             if (str.size() > 1 && top + str.size() < pinyin.size() &&
183                 pinyin[top + str.size()] != '\'' &&
184                 (str.back() == 'a' || str.back() == 'e' || str.back() == 'g' ||
185                  str.back() == 'n' || str.back() == 'o' || str.back() == 'r') &&
186                 map.find(str.substr(0, str.size() - 1)) != map.end()) {
187                 // str[0:-1] is also a full pinyin, check next pinyin
188                 auto nextMatch = longestMatch(iter + str.size(), end, flags);
189                 auto nextMatchAlt =
190                     longestMatch(iter + str.size() - 1, end, flags);
191                 auto matchSize = str.size() + nextMatch.first.size();
192                 auto matchSizeAlt = str.size() - 1 + nextMatchAlt.first.size();
193                 if (std::make_pair(matchSize, nextMatch.second) >=
194                     std::make_pair(matchSizeAlt, nextMatchAlt.second)) {
195                     result.addNext(top, top + str.size());
196                     q.push(top + str.size());
197                     nextSize[nNextSize++] = str.size();
198                 }
199                 if (std::make_pair(matchSize, nextMatch.second) <=
200                     std::make_pair(matchSizeAlt, nextMatchAlt.second)) {
201                     result.addNext(top, top + str.size() - 1);
202                     q.push(top + str.size() - 1);
203                     nextSize[nNextSize++] = str.size() - 1;
204                 }
205             } else {
206                 result.addNext(top, top + str.size());
207                 q.push(top + str.size());
208                 nextSize[nNextSize++] = str.size();
209             }
210 
211             for (size_t i = 0; i < nNextSize; i++) {
212                 if ((nextSize[i] >= 4 && flags.test(PinyinFuzzyFlag::Inner)) ||
213                     (nextSize[i] == 3 &&
214                      flags.test(PinyinFuzzyFlag::InnerShort))) {
215                     const auto &innerSegments = getInnerSegment();
216                     auto iter = innerSegments.find(
217                         std::string{str.substr(0, nextSize[i])});
218                     if (iter != innerSegments.end()) {
219                         result.addNext(top, top + iter->second.first.size());
220                         result.addNext(top + iter->second.first.size(),
221                                        top + nextSize[i]);
222                     }
223                 } else if (nextSize[i] == 2 &&
224                            flags.test(PinyinFuzzyFlag::InnerShort) &&
225                            str.substr(0, 2) == "ng") {
226                     // Handle ng -> n'g, the condition is so simple so we don't
227                     // make it go through the inner segment lookup.
228                     result.addNext(top, top + 1);
229                     result.addNext(top + 1, top + 2);
230                 }
231             }
232         }
233     }
234     return result;
235 }
236 
parseUserShuangpin(std::string userPinyin,const ShuangpinProfile & sp,PinyinFuzzyFlags flags)237 SegmentGraph PinyinEncoder::parseUserShuangpin(std::string userPinyin,
238                                                const ShuangpinProfile &sp,
239                                                PinyinFuzzyFlags flags) {
240     SegmentGraph result{std::move(userPinyin)};
241     auto pinyin = result.data();
242     std::transform(pinyin.begin(), pinyin.end(), pinyin.begin(),
243                    fcitx::charutils::tolower);
244 
245     // assume user always type valid shuangpin first, if not keep one.
246     size_t i = 0;
247 
248     const auto &table = sp.table();
249     while (i < pinyin.size()) {
250         auto start = i;
251         while (pinyin[i] == '\'' && i < pinyin.size()) {
252             i++;
253         }
254         if (start != i) {
255             result.addNext(start, i);
256             continue;
257         }
258         auto initial = pinyin[i];
259         char final = '\0';
260         if (i + 1 < pinyin.size() && pinyin[i + 1] != '\'') {
261             final = pinyin[i + 1];
262         }
263 
264         std::string match{initial};
265         if (final) {
266             match.push_back(final);
267         }
268 
269         auto longestMatchInTable = [flags](decltype(table) t,
270                                            const std::string &v) {
271             auto py = v;
272             while (!py.empty()) {
273                 auto iter = t.find(py);
274                 if (iter != t.end()) {
275                     for (const auto &p : iter->second) {
276                         if (flags.test(p.second)) {
277                             return iter;
278                         }
279                     }
280                 }
281                 py.pop_back();
282             }
283             return t.end();
284         };
285 
286         auto iter = longestMatchInTable(table, match);
287         if (iter != table.end()) {
288             result.addNext(i, i + iter->first.size());
289             i = i + iter->first.size();
290         } else {
291             result.addNext(i, i + 1);
292             i = i + 1;
293         }
294     }
295 
296     return result;
297 }
298 
encodeFullPinyin(std::string_view pinyin)299 std::vector<char> PinyinEncoder::encodeFullPinyin(std::string_view pinyin) {
300     std::vector<std::string> pinyins;
301     boost::split(pinyins, pinyin, boost::is_any_of("'"));
302     std::vector<char> result;
303     result.resize(pinyins.size() * 2);
304     int idx = 0;
305     for (const auto &singlePinyin : pinyins) {
306         const auto &map = getPinyinMap();
307         auto iter = map.find(singlePinyin);
308         if (iter == map.end() || iter->flags() != PinyinFuzzyFlag::None) {
309             throw std::invalid_argument("invalid full pinyin: " +
310                                         std::string{pinyin});
311         }
312         result[idx++] = static_cast<char>(iter->initial());
313         result[idx++] = static_cast<char>(iter->final());
314     }
315 
316     return result;
317 }
318 
encodeOneUserPinyin(std::string pinyin)319 std::vector<char> PinyinEncoder::encodeOneUserPinyin(std::string pinyin) {
320     if (pinyin.empty()) {
321         return {};
322     }
323     auto graph = parseUserPinyin(std::move(pinyin), PinyinFuzzyFlag::None);
324     std::vector<char> result;
325     const SegmentGraphNode *node = &graph.start(), *prev = nullptr;
326     while (node->nextSize()) {
327         prev = node;
328         node = &node->nexts().front();
329         auto seg = graph.segment(*prev, *node);
330         if (seg.empty() || seg[0] == '\'') {
331             continue;
332         }
333         auto syls = stringToSyllables(seg, PinyinFuzzyFlag::None);
334         if (syls.empty()) {
335             return {};
336         }
337         result.push_back(static_cast<char>(syls[0].first));
338         result.push_back(static_cast<char>(syls[0].second[0].first));
339     }
340     return result;
341 }
342 
isValidUserPinyin(const char * data,size_t size)343 bool PinyinEncoder::isValidUserPinyin(const char *data, size_t size) {
344     if (size % 2 != 0) {
345         return false;
346     }
347 
348     for (size_t i = 0; i < size / 2; i++) {
349         if (!PinyinEncoder::isValidInitial(data[i * 2])) {
350             return false;
351         }
352     }
353     return true;
354 }
355 
decodeFullPinyin(const char * data,size_t size)356 std::string PinyinEncoder::decodeFullPinyin(const char *data, size_t size) {
357     if (size % 2 != 0) {
358         throw std::invalid_argument("invalid pinyin key");
359     }
360     std::string result;
361     for (size_t i = 0, e = size / 2; i < e; i++) {
362         if (i) {
363             result += '\'';
364         }
365         result += initialToString(static_cast<PinyinInitial>(data[i * 2]));
366         result += finalToString(static_cast<PinyinFinal>(data[i * 2 + 1]));
367     }
368     return result;
369 }
370 
initialToString(PinyinInitial initial)371 const std::string &PinyinEncoder::initialToString(PinyinInitial initial) {
372     const static std::vector<std::string> s = []() {
373         std::vector<std::string> s;
374         s.resize(lastInitial - firstInitial + 1);
375         for (char c = firstInitial; c <= lastInitial; c++) {
376             auto iter = initialMap.left.find(static_cast<PinyinInitial>(c));
377             s[c - firstInitial] = iter->second;
378         }
379         return s;
380     }();
381     auto c = static_cast<char>(initial);
382     if (c >= firstInitial && c <= lastInitial) {
383         return s[c - firstInitial];
384     }
385     return emptyString;
386 }
387 
stringToInitial(const std::string & str)388 PinyinInitial PinyinEncoder::stringToInitial(const std::string &str) {
389     auto iter = initialMap.right.find(str);
390     if (iter != initialMap.right.end()) {
391         return iter->second;
392     }
393     return PinyinInitial::Invalid;
394 }
395 
finalToString(PinyinFinal final)396 const std::string &PinyinEncoder::finalToString(PinyinFinal final) {
397     const static std::vector<std::string> s = []() {
398         std::vector<std::string> s;
399         s.resize(lastFinal - firstFinal + 1);
400         for (char c = firstFinal; c <= lastFinal; c++) {
401             auto iter = finalMap.left.find(static_cast<PinyinFinal>(c));
402             s[c - firstFinal] = iter->second;
403         }
404         return s;
405     }();
406     auto c = static_cast<char>(final);
407     if (c >= firstFinal && c <= lastFinal) {
408         return s[c - firstFinal];
409     }
410     return emptyString;
411 }
412 
stringToFinal(const std::string & str)413 PinyinFinal PinyinEncoder::stringToFinal(const std::string &str) {
414     auto iter = finalMap.right.find(str);
415     if (iter != finalMap.right.end()) {
416         return iter->second;
417     }
418     return PinyinFinal::Invalid;
419 }
420 
isValidInitialFinal(PinyinInitial initial,PinyinFinal final)421 bool PinyinEncoder::isValidInitialFinal(PinyinInitial initial,
422                                         PinyinFinal final) {
423     if (initial != PinyinInitial::Invalid && final != PinyinFinal::Invalid) {
424         int16_t encode =
425             ((static_cast<int16_t>(initial) - PinyinEncoder::firstInitial) *
426              (PinyinEncoder::lastFinal - PinyinEncoder::firstFinal + 1)) +
427             (static_cast<int16_t>(final) - PinyinEncoder::firstFinal);
428         const auto &a = getEncodedInitialFinal();
429         return encode < static_cast<int>(a.size()) && a[encode];
430     }
431     return false;
432 }
433 
initialFinalToPinyinString(PinyinInitial initial,PinyinFinal final)434 std::string PinyinEncoder::initialFinalToPinyinString(PinyinInitial initial,
435                                                       PinyinFinal final) {
436     std::string result = initialToString(initial);
437     std::string finalString;
438     switch (final) {
439     case PinyinFinal::VE:
440     case PinyinFinal::V:
441         if (initial == PinyinInitial::N || initial == PinyinInitial::L) {
442             if (final == PinyinFinal::VE) {
443                 finalString = "üe";
444             } else {
445                 finalString = "ü";
446             }
447             break;
448         }
449         // FALLTHROUGH
450     default:
451         finalString = finalToString(final);
452         break;
453     }
454     result.append(finalString);
455     return result;
456 }
457 
// Expand one syllable into the (initial, final) alternatives enabled by flags
// and merge them into syls.
//
// syls is a list of initials, each with the finals collected for it. Every
// final carries a bool that is false for the syllable exactly as given and
// true for an alternative produced by the expansion below. Combinations that
// are already present in syls are left untouched.
static void getFuzzy(
    std::vector<std::pair<PinyinInitial,
                          std::vector<std::pair<PinyinFinal, bool>>>> &syls,
    PinyinSyllable syl, PinyinFuzzyFlags flags) {
    // ng/gn is already handled by table
    // Candidate lists; index 0 is always the syllable's own initial / final.
    boost::container::static_vector<PinyinInitial, 2> initials{syl.initial()};
    boost::container::static_vector<PinyinFinal, 10> finals{syl.final()};

    // for {s,z,c} we also want them to match {sh,zh,ch}
    // This applies only when the syllable has no final (PinyinFinal::Invalid);
    // the corresponding fuzzy flag is then switched on for this call.
    if (syl.final() == PinyinFinal::Invalid) {
        if (syl.initial() == PinyinInitial::C) {
            flags |= PinyinFuzzyFlag::C_CH;
        }
        if (syl.initial() == PinyinInitial::Z) {
            flags |= PinyinFuzzyFlag::Z_ZH;
        }
        if (syl.initial() == PinyinInitial::S) {
            flags |= PinyinFuzzyFlag::S_SH;
        }
    }

    // Pairs of initials that are interchangeable when the given flag is set.
    const static std::vector<
        std::tuple<PinyinInitial, PinyinInitial, PinyinFuzzyFlag>>
        initialFuzzies = {
            {PinyinInitial::C, PinyinInitial::CH, PinyinFuzzyFlag::C_CH},
            {PinyinInitial::S, PinyinInitial::SH, PinyinFuzzyFlag::S_SH},
            {PinyinInitial::Z, PinyinInitial::ZH, PinyinFuzzyFlag::Z_ZH},
            {PinyinInitial::F, PinyinInitial::H, PinyinFuzzyFlag::F_H},
            {PinyinInitial::L, PinyinInitial::N, PinyinFuzzyFlag::L_N},
        };

    // Add the counterpart of the first enabled rule that mentions this
    // initial; at most one alternative initial is added.
    for (const auto &initialFuzzy : initialFuzzies) {
        if ((syl.initial() == std::get<0>(initialFuzzy) ||
             syl.initial() == std::get<1>(initialFuzzy)) &&
            flags & std::get<2>(initialFuzzy)) {
            initials.push_back(syl.initial() == std::get<0>(initialFuzzy)
                                   ? std::get<1>(initialFuzzy)
                                   : std::get<0>(initialFuzzy));
            break;
        }
    }

    // Pairs of finals that are interchangeable when the given flag is set.
    const static std::vector<
        std::tuple<PinyinFinal, PinyinFinal, PinyinFuzzyFlag>>
        finalFuzzies = {
            {PinyinFinal::V, PinyinFinal::U, PinyinFuzzyFlag::V_U},
            {PinyinFinal::AN, PinyinFinal::ANG, PinyinFuzzyFlag::AN_ANG},
            {PinyinFinal::EN, PinyinFinal::ENG, PinyinFuzzyFlag::EN_ENG},
            {PinyinFinal::IAN, PinyinFinal::IANG, PinyinFuzzyFlag::IAN_IANG},
            {PinyinFinal::IN, PinyinFinal::ING, PinyinFuzzyFlag::IN_ING},
            {PinyinFinal::U, PinyinFinal::OU, PinyinFuzzyFlag::U_OU},
            {PinyinFinal::UAN, PinyinFinal::UANG, PinyinFuzzyFlag::UAN_UANG},
            {PinyinFinal::VE, PinyinFinal::UE, PinyinFuzzyFlag::VE_UE},
        };

    // Same as for initials: only the first enabled rule that mentions this
    // final contributes an alternative, because of the break.
    for (const auto &finalFuzzy : finalFuzzies) {
        if ((syl.final() == std::get<0>(finalFuzzy) ||
             syl.final() == std::get<1>(finalFuzzy)) &&
            flags & std::get<2>(finalFuzzy)) {
            finals.push_back(syl.final() == std::get<0>(finalFuzzy)
                                 ? std::get<1>(finalFuzzy)
                                 : std::get<0>(finalFuzzy));
            break;
        }
    }

    // "aeo"
    // Longer finals that a lone a / e / o may stand for. They are added only
    // when the syllable has the Zero initial with no fuzzy alternative and
    // PinyinFuzzyFlag::PartialFinal is set.

    const static std::vector<std::tuple<PinyinFinal, PinyinFinal>>
        partialFinals = {
            {PinyinFinal::A, PinyinFinal::AN},
            {PinyinFinal::A, PinyinFinal::ANG},
            {PinyinFinal::A, PinyinFinal::AI},
            {PinyinFinal::A, PinyinFinal::AO},
            {PinyinFinal::E, PinyinFinal::EI},
            {PinyinFinal::E, PinyinFinal::EN},
            {PinyinFinal::E, PinyinFinal::ENG},
            {PinyinFinal::E, PinyinFinal::ER},
            {PinyinFinal::O, PinyinFinal::OU},
            {PinyinFinal::O, PinyinFinal::ONG},
        };
    if (initials.size() == 1 && initials[0] == PinyinInitial::Zero &&
        flags.test(PinyinFuzzyFlag::PartialFinal)) {
        for (const auto &partialFinal : partialFinals) {
            if (syl.final() == std::get<0>(partialFinal)) {
                finals.push_back(std::get<1>(partialFinal));
            }
        }
    }

    // Merge every initial x final combination into syls. The original
    // combination (i == 0, j == 0) and combinations with an Invalid final are
    // always kept; any other one must pass isValidInitialFinal().
    for (size_t i = 0; i < initials.size(); i++) {
        for (size_t j = 0; j < finals.size(); j++) {
            auto initial = initials[i];
            auto final = finals[j];
            if ((i == 0 && j == 0) || final == PinyinFinal::Invalid ||
                PinyinEncoder::isValidInitialFinal(initial, final)) {
                // Find the entry for this initial, creating it if needed.
                auto iter = std::find_if(
                    syls.begin(), syls.end(),
                    [initial](const auto &p) { return p.first == initial; });
                if (iter == syls.end()) {
                    syls.emplace_back(std::piecewise_construct,
                                      std::forward_as_tuple(initial),
                                      std::forward_as_tuple());
                    iter = std::prev(syls.end());
                }
                // Note: this shadows the outer `finals` candidate list.
                auto &finals = iter->second;
                if (std::find_if(finals.begin(), finals.end(),
                                 [final](auto &p) {
                                     return p.first == final;
                                 }) == finals.end()) {
                    // The flag marks everything except the original pair.
                    finals.emplace_back(final, i > 0 || j > 0);
                }
            }
        }
    }
}
574 
575 MatchedPinyinSyllables
stringToSyllables(std::string_view pinyinView,PinyinFuzzyFlags flags)576 PinyinEncoder::stringToSyllables(std::string_view pinyinView,
577                                  PinyinFuzzyFlags flags) {
578     std::vector<
579         std::pair<PinyinInitial, std::vector<std::pair<PinyinFinal, bool>>>>
580         result;
581     std::string pinyin(pinyinView);
582     std::transform(pinyin.begin(), pinyin.end(), pinyin.begin(),
583                    fcitx::charutils::tolower);
584     const auto &map = getPinyinMap();
585     // we only want {M,N,R}/Invalid instead of {M,N,R}/Zero, so we could get
586     // match for everything.
587     if (pinyin != "m" && pinyin != "n" && pinyin != "r") {
588         auto iterPair = map.equal_range(pinyin);
589         for (const auto &item :
590              boost::make_iterator_range(iterPair.first, iterPair.second)) {
591             if (flags.test(item.flags())) {
592                 getFuzzy(result, {item.initial(), item.final()}, flags);
593             }
594         }
595     }
596 
597     auto iter = initialMap.right.find(std::string{pinyin});
598     if (initialMap.right.end() != iter) {
599         getFuzzy(result, {iter->second, PinyinFinal::Invalid}, flags);
600     }
601 
602     if (result.empty()) {
603         result.emplace_back(
604             std::piecewise_construct,
605             std::forward_as_tuple(PinyinInitial::Invalid),
606             std::forward_as_tuple(1,
607                                   std::make_pair(PinyinFinal::Invalid, false)));
608     }
609 
610 #if 0
611     else {
612         // replace invalid
613         for (auto &p : result) {
614             if (p.second.size() == 1 && p.second[0] == PinyinFinal::Invalid) {
615                 p.second.clear();
616                 for (char test = PinyinEncoder::firstFinal;
617                      test <= PinyinEncoder::lastFinal; test++) {
618                     auto final = static_cast<PinyinFinal>(test);
619                     if (PinyinEncoder::isValidInitialFinal(p.first, final)) {
620                         p.second.push_back(final);
621                     }
622                 }
623             }
624         }
625     }
626 #endif
627 
628     return result;
629 }
630 
631 MatchedPinyinSyllables
shuangpinToSyllables(std::string_view pinyinView,const ShuangpinProfile & sp,PinyinFuzzyFlags flags)632 PinyinEncoder::shuangpinToSyllables(std::string_view pinyinView,
633                                     const ShuangpinProfile &sp,
634                                     PinyinFuzzyFlags flags) {
635     assert(pinyinView.size() <= 2);
636     std::string pinyin(pinyinView);
637     std::transform(pinyin.begin(), pinyin.end(), pinyin.begin(),
638                    fcitx::charutils::tolower);
639     const auto &table = sp.table();
640     auto iter = table.find(pinyin);
641 
642     // Don't match partial final if our shuangpin is full size.
643     if (pinyinView.size() > 1) {
644         // This option is somewhat meaningless in full Shuangpin.
645         flags = flags.unset(PinyinFuzzyFlag::PartialFinal);
646     }
647 
648     std::vector<
649         std::pair<PinyinInitial, std::vector<std::pair<PinyinFinal, bool>>>>
650         result;
651     if (iter != table.end()) {
652         for (const auto &p : iter->second) {
653             if (flags.test(p.second)) {
654                 getFuzzy(result, {p.first.initial(), p.first.final()}, flags);
655             }
656         }
657     }
658 
659     if (result.empty()) {
660         result.emplace_back(
661             std::piecewise_construct,
662             std::forward_as_tuple(PinyinInitial::Invalid),
663             std::forward_as_tuple(1,
664                                   std::make_pair(PinyinFinal::Invalid, false)));
665     }
666 
667     return result;
668 }
669 
670 std::string
shuangpinToPinyin(std::string_view pinyinView,const libime::ShuangpinProfile & sp)671 PinyinEncoder::shuangpinToPinyin(std::string_view pinyinView,
672                                  const libime::ShuangpinProfile &sp) {
673     assert(pinyinView.size() <= 2);
674     auto syls = shuangpinToSyllables(pinyinView, sp, PinyinFuzzyFlag::None);
675     if (!syls.empty() && !syls[0].second.empty() && !syls[0].second[0].second) {
676         auto initial = syls[0].first;
677         auto final = syls[0].second[0].first;
678         return initialToString(initial) + finalToString(final);
679     }
680     return "";
681 }
682 
683 } // namespace libime
684