1 /*
2 * SPDX-FileCopyrightText: 2017~2017 CSSlayer <wengxt@gmail.com>
3 *
4 * SPDX-License-Identifier: LGPL-2.1-or-later
5 *
6 */
7 #include "jyutpingencoder.h"
8 #include "jyutpingdata.h"
9 #include <boost/algorithm/string.hpp>
10 #include <boost/bimap.hpp>
11 #include <queue>
12
13 namespace libime {
14 namespace jyutping {
15
16 static const std::string emptyString;
17
18 template <typename L, typename R>
19 boost::bimap<L, R>
makeBimap(std::initializer_list<typename boost::bimap<L,R>::value_type> list)20 makeBimap(std::initializer_list<typename boost::bimap<L, R>::value_type> list) {
21 return boost::bimap<L, R>(list.begin(), list.end());
22 }
23
24 static const auto initialMap = makeBimap<JyutpingInitial, std::string>({
25 {JyutpingInitial::B, "b"}, {JyutpingInitial::P, "p"},
26 {JyutpingInitial::M, "m"}, {JyutpingInitial::F, "f"},
27 {JyutpingInitial::D, "d"}, {JyutpingInitial::T, "t"},
28 {JyutpingInitial::N, "n"}, {JyutpingInitial::L, "l"},
29 {JyutpingInitial::G, "g"}, {JyutpingInitial::K, "k"},
30 {JyutpingInitial::NG, "ng"}, {JyutpingInitial::H, "h"},
31 {JyutpingInitial::GW, "gw"}, {JyutpingInitial::KW, "kw"},
32 {JyutpingInitial::W, "gw"}, {JyutpingInitial::W, "w"},
33 {JyutpingInitial::Z, "z"}, {JyutpingInitial::C, "c"},
34 {JyutpingInitial::S, "s"}, {JyutpingInitial::J, "j"},
35 {JyutpingInitial::Zero, ""},
36 });
37
38 static const auto finalMap = makeBimap<JyutpingFinal, std::string>({
39 {JyutpingFinal::AA, "aa"}, {JyutpingFinal::AAI, "aai"},
40 {JyutpingFinal::AAU, "aau"}, {JyutpingFinal::AAM, "aam"},
41 {JyutpingFinal::AAN, "aan"}, {JyutpingFinal::AANG, "aang"},
42 {JyutpingFinal::AAP, "aap"}, {JyutpingFinal::AAT, "aat"},
43 {JyutpingFinal::AAK, "aak"}, {JyutpingFinal::AI, "ai"},
44 {JyutpingFinal::AU, "au"}, {JyutpingFinal::AM, "am"},
45 {JyutpingFinal::AN, "an"}, {JyutpingFinal::ANG, "ang"},
46 {JyutpingFinal::AP, "ap"}, {JyutpingFinal::AT, "at"},
47 {JyutpingFinal::AK, "ak"}, {JyutpingFinal::E, "e"},
48 {JyutpingFinal::EI, "ei"}, {JyutpingFinal::ET, "et"},
49 {JyutpingFinal::EU, "eu"}, {JyutpingFinal::EM, "em"},
50 {JyutpingFinal::EN, "en"}, {JyutpingFinal::ENG, "eng"},
51 {JyutpingFinal::EP, "ep"}, {JyutpingFinal::EK, "ek"},
52 {JyutpingFinal::I, "i"}, {JyutpingFinal::IU, "iu"},
53 {JyutpingFinal::IM, "im"}, {JyutpingFinal::IN, "in"},
54 {JyutpingFinal::ING, "ing"}, {JyutpingFinal::IP, "ip"},
55 {JyutpingFinal::IT, "it"}, {JyutpingFinal::IK, "ik"},
56 {JyutpingFinal::O, "o"}, {JyutpingFinal::OI, "oi"},
57 {JyutpingFinal::OU, "ou"}, {JyutpingFinal::OM, "om"},
58 {JyutpingFinal::ON, "on"}, {JyutpingFinal::ONG, "ong"},
59 {JyutpingFinal::OT, "ot"}, {JyutpingFinal::OK, "ok"},
60 {JyutpingFinal::OE, "oe"}, {JyutpingFinal::OENG, "oeng"},
61 {JyutpingFinal::OEK, "oek"}, {JyutpingFinal::EOI, "eoi"},
62 {JyutpingFinal::EON, "eon"}, {JyutpingFinal::EOT, "eot"},
63 {JyutpingFinal::U, "u"}, {JyutpingFinal::UI, "ui"},
64 {JyutpingFinal::UN, "un"}, {JyutpingFinal::UNG, "ung"},
65 {JyutpingFinal::UT, "ut"}, {JyutpingFinal::UK, "uk"},
66 {JyutpingFinal::YU, "yu"}, {JyutpingFinal::YUN, "yun"},
67 {JyutpingFinal::YUT, "yut"}, {JyutpingFinal::M, "m"},
68 {JyutpingFinal::NG, "ng"}, {JyutpingFinal::Zero, ""},
69 });
70
71 static const int maxJyutpingLength = 6;
72
73 template <typename Iter>
longestMatch(Iter iter,Iter end)74 std::pair<std::string_view, bool> longestMatch(Iter iter, Iter end) {
75 if (std::distance(iter, end) > maxJyutpingLength) {
76 end = iter + maxJyutpingLength;
77 }
78 auto range = std::string_view(&*iter, std::distance(iter, end));
79 auto &map = getJyutpingMap();
80 for (; range.size(); range.remove_suffix(1)) {
81 auto iterPair = map.equal_range(range);
82 if (iterPair.first != iterPair.second) {
83 // do not consider m/ng as complete jyutping
84 return std::make_pair(range, (range != "m" && range != "ng"));
85 }
86 if (range.size() <= 2) {
87 auto iter = initialMap.right.find(std::string{range});
88 if (iter != initialMap.right.end()) {
89 return std::make_pair(range, false);
90 }
91 }
92 }
93
94 if (!range.size()) {
95 range = std::string_view(&*iter, 1);
96 }
97
98 return std::make_pair(range, false);
99 }
100
toString() const101 std::string JyutpingSyllable::toString() const {
102 return JyutpingEncoder::initialToString(initial_) +
103 JyutpingEncoder::finalToString(final_);
104 }
105
parseUserJyutping(std::string userJyutping,bool inner)106 SegmentGraph JyutpingEncoder::parseUserJyutping(std::string userJyutping,
107 bool inner) {
108 SegmentGraph result(std::move(userJyutping));
109 const auto &jyutping = result.data();
110 auto end = jyutping.end();
111 std::priority_queue<size_t, std::vector<size_t>, std::greater<size_t>> q;
112 q.push(0);
113 while (q.size()) {
114 size_t top;
115 do {
116 top = q.top();
117 q.pop();
118 } while (q.size() && q.top() == top);
119 if (top >= jyutping.size()) {
120 continue;
121 }
122 auto iter = std::next(jyutping.begin(), top);
123 if (*iter == '\'') {
124 while (*iter == '\'' && iter != jyutping.end()) {
125 iter++;
126 }
127 auto next = std::distance(jyutping.begin(), iter);
128 result.addNext(top, next);
129 if (static_cast<size_t>(next) < jyutping.size()) {
130 q.push(next);
131 }
132 continue;
133 }
134 std::string_view str;
135 bool isCompleteJyutping;
136 std::tie(str, isCompleteJyutping) = longestMatch(iter, end);
137
138 // it's not complete a jyutping, no need to try
139 if (!isCompleteJyutping) {
140 result.addNext(top, top + str.size());
141 q.push(top + str.size());
142 } else {
143 // check fuzzy seg
144 // jyutping may end with aegikmnoptu
145 // and may start with abcdefghjklmnopstuwz.
146 // the intersection is aegkmnoptu.
147 // also, make sure current jyutping does not end with a separator.
148 auto &map = getJyutpingMap();
149 std::array<size_t, 2> nextSize;
150 size_t nNextSize = 0;
151 if (str.size() > 1 && top + str.size() < jyutping.size() &&
152 jyutping[top + str.size()] != '\'' &&
153 (str.back() == 'a' || str.back() == 'e' || str.back() == 'g' ||
154 str.back() == 'k' || str.back() == 'm' || str.back() == 'n' ||
155 str.back() == 'o' || str.back() == 'p' || str.back() == 't' ||
156 str.back() == 'u') &&
157 map.find(str.substr(0, str.size() - 1)) != map.end()) {
158 // str[0:-1] is also a full jyutping, check next jyutping
159 auto nextMatch = longestMatch(iter + str.size(), end);
160 auto nextMatchAlt = longestMatch(iter + str.size() - 1, end);
161 auto matchSize = str.size() + nextMatch.first.size();
162 auto matchSizeAlt = str.size() - 1 + nextMatchAlt.first.size();
163 if (std::make_pair(matchSize, nextMatch.second) >=
164 std::make_pair(matchSizeAlt, nextMatchAlt.second)) {
165 result.addNext(top, top + str.size());
166 q.push(top + str.size());
167 nextSize[nNextSize++] = str.size();
168 }
169 if (std::make_pair(matchSize, nextMatch.second) <=
170 std::make_pair(matchSizeAlt, nextMatchAlt.second)) {
171 result.addNext(top, top + str.size() - 1);
172 q.push(top + str.size() - 1);
173 nextSize[nNextSize++] = str.size() - 1;
174 }
175 } else {
176 result.addNext(top, top + str.size());
177 q.push(top + str.size());
178 nextSize[nNextSize++] = str.size();
179 }
180
181 for (size_t i = 0; i < nNextSize; i++) {
182 if (nextSize[i] >= 4 && inner) {
183 auto &innerSegments = getInnerSegment();
184 auto iter = innerSegments.find(
185 std::string{str.substr(0, nextSize[i])});
186 if (iter != innerSegments.end()) {
187 result.addNext(top, top + iter->second.first.size());
188 result.addNext(top + iter->second.first.size(),
189 top + nextSize[i]);
190 }
191 }
192 }
193 }
194 }
195 return result;
196 }
197
encodeOneUserJyutping(std::string jyutping)198 std::vector<char> JyutpingEncoder::encodeOneUserJyutping(std::string jyutping) {
199 if (jyutping.empty()) {
200 return {};
201 }
202 auto graph = parseUserJyutping(std::move(jyutping), false);
203 std::vector<char> result;
204 const SegmentGraphNode *node = &graph.start(), *prev = nullptr;
205 while (node->nextSize()) {
206 prev = node;
207 node = &node->nexts().front();
208 auto seg = graph.segment(*prev, *node);
209 if (seg.empty() || seg[0] == '\'') {
210 continue;
211 }
212 auto syls = stringToSyllables(seg);
213 if (!syls.size()) {
214 return {};
215 }
216 result.push_back(static_cast<char>(syls[0].first));
217 result.push_back(static_cast<char>(syls[0].second[0].first));
218 }
219 return result;
220 }
221
isValidUserJyutping(const char * data,size_t size)222 bool JyutpingEncoder::isValidUserJyutping(const char *data, size_t size) {
223 if (size % 2 != 0) {
224 return false;
225 }
226
227 for (size_t i = 0; i < size / 2; i++) {
228 if (!JyutpingEncoder::isValidInitial(data[i * 2])) {
229 return false;
230 }
231 }
232 return true;
233 }
234
decodeFullJyutping(const char * data,size_t size)235 std::string JyutpingEncoder::decodeFullJyutping(const char *data, size_t size) {
236 if (size % 2 != 0) {
237 throw std::invalid_argument("invalid jyutping key");
238 }
239 std::string result;
240 for (size_t i = 0, e = size / 2; i < e; i++) {
241 if (i) {
242 result += '\'';
243 }
244 result += initialToString(static_cast<JyutpingInitial>(data[i * 2]));
245 result += finalToString(static_cast<JyutpingFinal>(data[i * 2 + 1]));
246 }
247 return result;
248 }
249
initialToString(JyutpingInitial initial)250 const std::string &JyutpingEncoder::initialToString(JyutpingInitial initial) {
251 const static std::vector<std::string> s = []() {
252 std::vector<std::string> s;
253 s.resize(lastInitial - firstInitial + 1);
254 for (char c = firstInitial; c <= lastInitial; c++) {
255 auto iter = initialMap.left.find(static_cast<JyutpingInitial>(c));
256 s[c - firstInitial] = iter->second;
257 }
258 return s;
259 }();
260 auto c = static_cast<char>(initial);
261 if (c >= firstInitial && c <= lastInitial) {
262 return s[c - firstInitial];
263 }
264 return emptyString;
265 }
266
stringToInitial(const std::string & str)267 JyutpingInitial JyutpingEncoder::stringToInitial(const std::string &str) {
268 auto iter = initialMap.right.find(str);
269 if (iter != initialMap.right.end()) {
270 return iter->second;
271 }
272 return JyutpingInitial::Invalid;
273 }
274
finalToString(JyutpingFinal final)275 const std::string &JyutpingEncoder::finalToString(JyutpingFinal final) {
276 const static std::vector<std::string> s = []() {
277 std::vector<std::string> s;
278 s.resize(lastFinal - firstFinal + 1);
279 for (char c = firstFinal; c <= lastFinal; c++) {
280 auto iter = finalMap.left.find(static_cast<JyutpingFinal>(c));
281 s[c - firstFinal] = iter->second;
282 }
283 return s;
284 }();
285 auto c = static_cast<char>(final);
286 if (c >= firstFinal && c <= lastFinal) {
287 return s[c - firstFinal];
288 }
289 return emptyString;
290 }
291
stringToFinal(const std::string & str)292 JyutpingFinal JyutpingEncoder::stringToFinal(const std::string &str) {
293 auto iter = finalMap.right.find(str);
294 if (iter != finalMap.right.end()) {
295 return iter->second;
296 }
297 return JyutpingFinal::Invalid;
298 }
299
isValidInitialFinal(JyutpingInitial initial,JyutpingFinal final)300 bool JyutpingEncoder::isValidInitialFinal(JyutpingInitial initial,
301 JyutpingFinal final) {
302 if (initial != JyutpingInitial::Invalid &&
303 final != JyutpingFinal::Invalid) {
304 int16_t encode =
305 ((static_cast<int16_t>(initial) - JyutpingEncoder::firstInitial) *
306 (JyutpingEncoder::lastFinal - JyutpingEncoder::firstFinal + 1)) +
307 (static_cast<int16_t>(final) - JyutpingEncoder::firstFinal);
308 const auto &a = getEncodedInitialFinal();
309 return encode < static_cast<int>(a.size()) && a[encode];
310 }
311 return false;
312 }
313
getFuzzy(std::vector<std::pair<JyutpingInitial,std::vector<std::pair<JyutpingFinal,bool>>>> & syls,JyutpingSyllable syl)314 static void getFuzzy(
315 std::vector<std::pair<JyutpingInitial,
316 std::vector<std::pair<JyutpingFinal, bool>>>> &syls,
317 JyutpingSyllable syl) {
318 JyutpingInitial initials[2] = {syl.initial(), JyutpingInitial::Invalid};
319 JyutpingFinal finals[2] = {syl.final(), JyutpingFinal::Invalid};
320 int initialSize = 1;
321 int finalSize = 1;
322
323 for (int i = 0; i < initialSize; i++) {
324 for (int j = 0; j < finalSize; j++) {
325 auto initial = initials[i];
326 auto final = finals[j];
327 if ((i == 0 && j == 0) || final == JyutpingFinal::Invalid ||
328 JyutpingEncoder::isValidInitialFinal(initial, final)) {
329 auto iter = std::find_if(
330 syls.begin(), syls.end(),
331 [initial](const auto &p) { return p.first == initial; });
332 if (iter == syls.end()) {
333 syls.emplace_back(std::piecewise_construct,
334 std::forward_as_tuple(initial),
335 std::forward_as_tuple());
336 iter = std::prev(syls.end());
337 }
338 auto &finals = iter->second;
339 if (std::find_if(finals.begin(), finals.end(),
340 [final](auto &p) {
341 return p.first == final;
342 }) == finals.end()) {
343 finals.emplace_back(final, i > 0 || j > 0);
344 }
345 }
346 }
347 }
348 }
349
350 MatchedJyutpingSyllables
stringToSyllables(std::string_view jyutping)351 JyutpingEncoder::stringToSyllables(std::string_view jyutping) {
352 std::vector<
353 std::pair<JyutpingInitial, std::vector<std::pair<JyutpingFinal, bool>>>>
354 result;
355 auto &map = getJyutpingMap();
356 // we only want {M,N,R}/Invalid instead of {M,N,R}/Zero, so we could get
357 // match for everything.
358 if (jyutping != "m" && jyutping != "ng") {
359 auto iterPair = map.equal_range(jyutping);
360 for (auto &item :
361 boost::make_iterator_range(iterPair.first, iterPair.second)) {
362 getFuzzy(result, {item.initial(), item.final()});
363 }
364 }
365
366 auto iter = initialMap.right.find(std::string{jyutping});
367 if (initialMap.right.end() != iter) {
368 getFuzzy(result, {iter->second, JyutpingFinal::Invalid});
369 }
370
371 if (result.size() == 0) {
372 result.emplace_back(
373 std::piecewise_construct,
374 std::forward_as_tuple(JyutpingInitial::Invalid),
375 std::forward_as_tuple(
376 1, std::make_pair(JyutpingFinal::Invalid, false)));
377 }
378
379 return result;
380 }
381
382 std::vector<char>
encodeFullJyutping(std::string_view jyutping)383 JyutpingEncoder::encodeFullJyutping(std::string_view jyutping) {
384 std::vector<std::string> jyutpings;
385 boost::split(jyutpings, jyutping, boost::is_any_of("'"));
386 std::vector<char> result;
387 result.resize(jyutpings.size() * 2);
388 int idx = 0;
389 for (const auto &singleJyutping : jyutpings) {
390 auto &map = getJyutpingMap();
391 auto iter = map.find(singleJyutping);
392 if (iter == map.end()) {
393 throw std::invalid_argument("invalid full jyutping: " +
394 std::string{jyutping});
395 }
396 result[idx++] = static_cast<char>(iter->initial());
397 result[idx++] = static_cast<char>(iter->final());
398 }
399
400 return result;
401 }
402
403 } // namespace jyutping
404 } // namespace libime
405