1 /*
2 * SPDX-FileCopyrightText: 2012-2012 Yichao Yu <yyc1992@gmail.com>
3 * SPDX-FileCopyrightText: 2017-2017 CSSlayer <wengxt@gmail.com>
4 *
5 * SPDX-License-Identifier: LGPL-2.1-or-later
6 *
7 */
8
9 #include "spell-custom-dict.h"
10 #include <fcntl.h>
11 #include <sys/stat.h>
12 #include <stdexcept>
13 #include "fcitx-utils/cutf8.h"
14 #include "fcitx-utils/endian_p.h"
15 #include "fcitx-utils/fs.h"
16 #include "fcitx-utils/standardpath.h"
17
18 #define case_a_z \
19 case 'a': \
20 case 'b': \
21 case 'c': \
22 case 'd': \
23 case 'e': \
24 case 'f': \
25 case 'g': \
26 case 'h': \
27 case 'i': \
28 case 'j': \
29 case 'k': \
30 case 'l': \
31 case 'm': \
32 case 'n': \
33 case 'o': \
34 case 'p': \
35 case 'q': \
36 case 'r': \
37 case 's': \
38 case 't': \
39 case 'u': \
40 case 'v': \
41 case 'w': \
42 case 'x': \
43 case 'y': \
44 case 'z'
45
46 #define case_A_Z \
47 case 'A': \
48 case 'B': \
49 case 'C': \
50 case 'D': \
51 case 'E': \
52 case 'F': \
53 case 'G': \
54 case 'H': \
55 case 'I': \
56 case 'J': \
57 case 'K': \
58 case 'L': \
59 case 'M': \
60 case 'N': \
61 case 'O': \
62 case 'P': \
63 case 'Q': \
64 case 'R': \
65 case 'S': \
66 case 'T': \
67 case 'U': \
68 case 'V': \
69 case 'W': \
70 case 'X': \
71 case 'Y': \
72 case 'Z'
73
74 #define DICT_BIN_MAGIC "FSCD0000"
75
checkLang(const std::string & full_lang,const std::string & lang)76 bool checkLang(const std::string &full_lang, const std::string &lang) {
77 if (full_lang.empty() || lang.empty()) {
78 return false;
79 }
80 if (full_lang.compare(0, lang.size(), lang) != 0) {
81 return false;
82 }
83 switch (full_lang[lang.size()]) {
84 case '\0':
85 case '_':
86 return true;
87 default:
88 break;
89 }
90 return false;
91 }
92
load_le32(const void * p)93 static inline uint32_t load_le32(const void *p) {
94 return le32toh(*(uint32_t *)p);
95 }
96
isFirstCapital(const std::string & str)97 static bool isFirstCapital(const std::string &str) {
98 if (str.empty()) {
99 return false;
100 }
101 auto iter = str.begin();
102 switch (*iter) {
103 case_A_Z:
104 break;
105 default:
106 return false;
107 }
108 ++iter;
109 for (; iter != str.end(); iter++) {
110 switch (*iter) {
111 case_A_Z:
112 return false;
113 default:
114 continue;
115 }
116 }
117 return true;
118 }
119
isAllCapital(const std::string & str)120 static bool isAllCapital(const std::string &str) {
121 if (str.empty()) {
122 return false;
123 }
124 for (auto iter = str.begin(); iter != str.end(); iter++) {
125 switch (*iter) {
126 case_a_z:
127 return false;
128 default:
129 continue;
130 }
131 }
132 return true;
133 }
134
135 enum {
136 CUSTOM_DEFAULT,
137 CUSTOM_FIRST_CAPITAL,
138 CUSTOM_ALL_CAPITAL,
139 };
140
toUpperString(std::string & str)141 static void toUpperString(std::string &str) {
142 if (str.empty()) {
143 return;
144 }
145 for (auto iter = str.begin(); iter != str.end(); iter++) {
146 switch (*iter) {
147 case_a_z:
148 *iter += 'A' - 'a';
149 break;
150 default:
151 break;
152 }
153 }
154 }
155
156 namespace fcitx {
157
158 class SpellCustomDictEn : public SpellCustomDict {
159 public:
SpellCustomDictEn()160 SpellCustomDictEn() {
161 delim_ = " _-,./?!%";
162 loadDict("en");
163 }
164
wordCompare(unsigned int c1,unsigned int c2)165 bool wordCompare(unsigned int c1, unsigned int c2) override {
166 switch (c1) {
167 case_A_Z:
168 c1 += 'a' - 'A';
169 break;
170 case_a_z:
171 break;
172 default:
173 return c1 == c2;
174 }
175 switch (c2) {
176 case_A_Z:
177 c2 += 'a' - 'A';
178 break;
179 }
180 return c1 == c2;
181 }
wordCheck(const std::string & str)182 int wordCheck(const std::string &str) override {
183 if (isFirstCapital(str)) {
184 return CUSTOM_FIRST_CAPITAL;
185 }
186 if (isAllCapital(str)) {
187 return CUSTOM_ALL_CAPITAL;
188 }
189 return CUSTOM_DEFAULT;
190 }
191
hintComplete(std::vector<std::string> & hints,int type)192 void hintComplete(std::vector<std::string> &hints, int type) override {
193 switch (type) {
194 case CUSTOM_ALL_CAPITAL:
195 for (auto &hint : hints) {
196 toUpperString(hint);
197 }
198 break;
199 case CUSTOM_FIRST_CAPITAL:
200 for (auto &hint : hints) {
201 if (hint.empty()) {
202 continue;
203 }
204 switch (hint[0]) {
205 case_a_z:
206 hint[0] += 'A' - 'a';
207 break;
208 default:
209 break;
210 }
211 }
212 default:
213 break;
214 }
215 }
216 };
217
218 #if 0
219 static inline uint16_t
220 load_le16(const void* p)
221 {
222 return le16toh(*(uint16_t*)p);
223 }
224 #endif
225
226 /**
227 // Open the dict file, return -1 if failed.
228 **/
locateDictFile(const std::string & lang)229 std::string SpellCustomDict::locateDictFile(const std::string &lang) {
230 auto templatePath = "spell/" + lang + "_dict.fscd";
231 const auto &standardPath = StandardPath::global();
232 std::string path;
233 standardPath.scanDirectories(
234 StandardPath::Type::PkgData,
235 [&path, &templatePath](const std::string &dirPath, bool isUser) {
236 if (isUser) {
237 return true;
238 }
239 auto fullPath = stringutils::joinPath(dirPath, templatePath);
240 if (fs::isreg(fullPath)) {
241 path = fullPath;
242 return false;
243 }
244 return true;
245 });
246 return path;
247 }
248
loadDict(const std::string & lang)249 void SpellCustomDict::loadDict(const std::string &lang) {
250 auto file = locateDictFile(lang);
251 auto fd = UnixFD::own(open(file.c_str(), O_RDONLY));
252
253 if (!fd.isValid()) {
254 throw std::runtime_error("failed to open dict file");
255 }
256
257 do {
258 struct stat stat_buf;
259 size_t total_len;
260 char magic_buff[sizeof(DICT_BIN_MAGIC) - 1];
261 if (fstat(fd.fd(), &stat_buf) == -1 ||
262 static_cast<size_t>(stat_buf.st_size) <=
263 sizeof(uint32_t) + sizeof(magic_buff)) {
264 break;
265 }
266 if (fs::safeRead(fd.fd(), magic_buff, sizeof(magic_buff)) !=
267 sizeof(magic_buff)) {
268 break;
269 }
270 if (memcmp(DICT_BIN_MAGIC, magic_buff, sizeof(magic_buff)) != 0) {
271 break;
272 }
273 total_len = stat_buf.st_size - sizeof(magic_buff);
274 data_.resize(total_len + 1);
275 if (fs::safeRead(fd.fd(), data_.data(), total_len) !=
276 static_cast<ssize_t>(total_len)) {
277 break;
278 }
279 data_[total_len] = '\0';
280
281 auto lcount = load_le32(data_.data());
282 words_.resize(lcount);
283
284 /* save words offset's. */
285 size_t i, j;
286 for (i = sizeof(uint32_t), j = 0; i < total_len && j < lcount; i += 1) {
287 i += sizeof(uint16_t);
288 int l = strlen(data_.data() + i);
289 if (!l) {
290 continue;
291 }
292 words_[j++] = i;
293 i += l;
294 }
295 if (j < lcount || i < total_len) {
296 break;
297 }
298 return;
299 } while (0);
300
301 throw std::runtime_error("failed to read dict file");
302 }
303
requestDict(const std::string & lang)304 SpellCustomDict *SpellCustomDict::requestDict(const std::string &lang) {
305 if (checkLang(lang, "en")) {
306 return new SpellCustomDictEn;
307 }
308 return nullptr;
309 }
310
checkDict(const std::string & lang)311 bool SpellCustomDict::checkDict(const std::string &lang) {
312 return !locateDictFile(lang).empty();
313 }
314
getDistance(const char * word,int utf8Len,const char * dict)315 int SpellCustomDict::getDistance(const char *word, int utf8Len,
316 const char *dict) {
317 #define REPLACE_WEIGHT 3
318 #define INSERT_WEIGHT 3
319 #define REMOVE_WEIGHT 3
320 #define END_WEIGHT 1
321 /*
322 * three kinds of error, replace, insert and remove
323 * replace means apple vs aplle
324 * insert means apple vs applee
325 * remove means apple vs aple
326 *
327 * each error need to follow a correct match.
328 *
329 * number of "remove error" shoud be no more than "maxremove"
330 * while maxremove equals to (length - 2) / 3
331 *
332 * and the total error number should be no more than "maxdiff"
333 * while maxdiff equales to length / 3.
334 */
335 int replace = 0;
336 int insert = 0;
337 int remove = 0;
338 int diff = 0;
339 int maxdiff;
340 int maxremove;
341 unsigned int cur_word_c;
342 unsigned int cur_dict_c;
343 unsigned int next_word_c;
344 unsigned int next_dict_c;
345 maxdiff = utf8Len / 3;
346 maxremove = (utf8Len - 2) / 3;
347 word = fcitx_utf8_get_char(word, &cur_word_c);
348 dict = fcitx_utf8_get_char(dict, &cur_dict_c);
349 while ((diff = replace + insert + remove) <= maxdiff &&
350 remove <= maxremove) {
351 /*
352 * cur_word_c and cur_dict_c are the current characters
353 * and dict and word are pointing to the next one.
354 */
355 if (!cur_word_c) {
356 return ((replace * REPLACE_WEIGHT + insert * INSERT_WEIGHT +
357 remove * REMOVE_WEIGHT) +
358 (cur_dict_c
359 ? (static_cast<int>(fcitx_utf8_strlen(dict)) + 1) *
360 END_WEIGHT
361 : 0));
362 }
363 word = fcitx_utf8_get_char(word, &next_word_c);
364
365 /* check remove error */
366 if (!cur_dict_c) {
367 if (next_word_c) {
368 return -1;
369 }
370 remove++;
371 if (diff <= maxdiff && remove <= maxremove) {
372 return (replace * REPLACE_WEIGHT + insert * INSERT_WEIGHT +
373 remove * REMOVE_WEIGHT);
374 }
375 return -1;
376 }
377 dict = fcitx_utf8_get_char(dict, &next_dict_c);
378 if (cur_word_c == cur_dict_c || (wordCompare(cur_word_c, cur_dict_c))) {
379 cur_word_c = next_word_c;
380 cur_dict_c = next_dict_c;
381 continue;
382 }
383 if (next_word_c == cur_dict_c ||
384 (next_word_c && wordCompare(next_word_c, cur_dict_c))) {
385 word = fcitx_utf8_get_char(word, &cur_word_c);
386 cur_dict_c = next_dict_c;
387 remove++;
388 continue;
389 }
390
391 /* check insert error */
392 if (cur_word_c == next_dict_c ||
393 (next_dict_c && wordCompare(cur_word_c, next_dict_c))) {
394 cur_word_c = next_word_c;
395 dict = fcitx_utf8_get_char(dict, &cur_dict_c);
396 insert++;
397 continue;
398 }
399
400 /* check replace error */
401 if (next_word_c == next_dict_c ||
402 (next_word_c && next_dict_c &&
403 wordCompare(next_word_c, next_dict_c))) {
404 if (next_word_c) {
405 dict = fcitx_utf8_get_char(dict, &cur_dict_c);
406 word = fcitx_utf8_get_char(word, &cur_word_c);
407 } else {
408 cur_word_c = 0;
409 cur_dict_c = 0;
410 }
411 replace++;
412 continue;
413 }
414 break;
415 }
416 return -1;
417 }
418
hint(const std::string & str,size_t limit)419 std::vector<std::string> SpellCustomDict::hint(const std::string &str,
420 size_t limit) {
421 const char *word = str.c_str();
422 const char *real_word = word;
423 std::vector<std::string> result;
424 std::vector<std::pair<const char *, int>> tops;
425 if (!delim_.empty()) {
426 size_t delta;
427 while (real_word[delta = strcspn(real_word, delim_.c_str())]) {
428 real_word += delta + 1;
429 }
430 }
431 if (!real_word[0]) {
432 return {};
433 }
434 auto word_type = wordCheck(real_word);
435 int word_len = fcitx_utf8_strlen(real_word);
436 auto compare = [](const std::pair<const char *, int> &lhs,
437 const std::pair<const char *, int> &rhs) {
438 return lhs.second < rhs.second;
439 };
440 for (const auto &wordOffset : words_) {
441 int dist;
442 const char *dictWord = data_.data() + wordOffset;
443 if ((dist = getDistance(real_word, word_len, dictWord)) >= 0) {
444 tops.emplace_back(dictWord, dist);
445 std::push_heap(tops.begin(), tops.end(), compare);
446 if (tops.size() > limit) {
447 std::pop_heap(tops.begin(), tops.end(), compare);
448 tops.pop_back();
449 }
450 }
451 }
452
453 // Or sort heap?..
454 std::sort(tops.begin(), tops.end(), compare);
455
456 result.reserve(tops.size());
457 for (auto &top : tops) {
458 result.emplace_back(top.first);
459 }
460 hintComplete(result, word_type);
461 return result;
462 }
463 } // namespace fcitx
464