1 /*
2  * SPDX-FileCopyrightText: 2012-2012 Yichao Yu <yyc1992@gmail.com>
3  * SPDX-FileCopyrightText: 2017-2017 CSSlayer <wengxt@gmail.com>
4  *
5  * SPDX-License-Identifier: LGPL-2.1-or-later
6  *
7  */
8 
9 #include "spell-custom-dict.h"
10 #include <fcntl.h>
11 #include <sys/stat.h>
12 #include <stdexcept>
13 #include "fcitx-utils/cutf8.h"
14 #include "fcitx-utils/endian_p.h"
15 #include "fcitx-utils/fs.h"
16 #include "fcitx-utils/standardpath.h"
17 
18 #define case_a_z                                                               \
19     case 'a':                                                                  \
20     case 'b':                                                                  \
21     case 'c':                                                                  \
22     case 'd':                                                                  \
23     case 'e':                                                                  \
24     case 'f':                                                                  \
25     case 'g':                                                                  \
26     case 'h':                                                                  \
27     case 'i':                                                                  \
28     case 'j':                                                                  \
29     case 'k':                                                                  \
30     case 'l':                                                                  \
31     case 'm':                                                                  \
32     case 'n':                                                                  \
33     case 'o':                                                                  \
34     case 'p':                                                                  \
35     case 'q':                                                                  \
36     case 'r':                                                                  \
37     case 's':                                                                  \
38     case 't':                                                                  \
39     case 'u':                                                                  \
40     case 'v':                                                                  \
41     case 'w':                                                                  \
42     case 'x':                                                                  \
43     case 'y':                                                                  \
44     case 'z'
45 
46 #define case_A_Z                                                               \
47     case 'A':                                                                  \
48     case 'B':                                                                  \
49     case 'C':                                                                  \
50     case 'D':                                                                  \
51     case 'E':                                                                  \
52     case 'F':                                                                  \
53     case 'G':                                                                  \
54     case 'H':                                                                  \
55     case 'I':                                                                  \
56     case 'J':                                                                  \
57     case 'K':                                                                  \
58     case 'L':                                                                  \
59     case 'M':                                                                  \
60     case 'N':                                                                  \
61     case 'O':                                                                  \
62     case 'P':                                                                  \
63     case 'Q':                                                                  \
64     case 'R':                                                                  \
65     case 'S':                                                                  \
66     case 'T':                                                                  \
67     case 'U':                                                                  \
68     case 'V':                                                                  \
69     case 'W':                                                                  \
70     case 'X':                                                                  \
71     case 'Y':                                                                  \
72     case 'Z'
73 
74 #define DICT_BIN_MAGIC "FSCD0000"
75 
checkLang(const std::string & full_lang,const std::string & lang)76 bool checkLang(const std::string &full_lang, const std::string &lang) {
77     if (full_lang.empty() || lang.empty()) {
78         return false;
79     }
80     if (full_lang.compare(0, lang.size(), lang) != 0) {
81         return false;
82     }
83     switch (full_lang[lang.size()]) {
84     case '\0':
85     case '_':
86         return true;
87     default:
88         break;
89     }
90     return false;
91 }
92 
load_le32(const void * p)93 static inline uint32_t load_le32(const void *p) {
94     return le32toh(*(uint32_t *)p);
95 }
96 
isFirstCapital(const std::string & str)97 static bool isFirstCapital(const std::string &str) {
98     if (str.empty()) {
99         return false;
100     }
101     auto iter = str.begin();
102     switch (*iter) {
103     case_A_Z:
104         break;
105     default:
106         return false;
107     }
108     ++iter;
109     for (; iter != str.end(); iter++) {
110         switch (*iter) {
111         case_A_Z:
112             return false;
113         default:
114             continue;
115         }
116     }
117     return true;
118 }
119 
isAllCapital(const std::string & str)120 static bool isAllCapital(const std::string &str) {
121     if (str.empty()) {
122         return false;
123     }
124     for (auto iter = str.begin(); iter != str.end(); iter++) {
125         switch (*iter) {
126         case_a_z:
127             return false;
128         default:
129             continue;
130         }
131     }
132     return true;
133 }
134 
135 enum {
136     CUSTOM_DEFAULT,
137     CUSTOM_FIRST_CAPITAL,
138     CUSTOM_ALL_CAPITAL,
139 };
140 
toUpperString(std::string & str)141 static void toUpperString(std::string &str) {
142     if (str.empty()) {
143         return;
144     }
145     for (auto iter = str.begin(); iter != str.end(); iter++) {
146         switch (*iter) {
147         case_a_z:
148             *iter += 'A' - 'a';
149             break;
150         default:
151             break;
152         }
153     }
154 }
155 
156 namespace fcitx {
157 
158 class SpellCustomDictEn : public SpellCustomDict {
159 public:
SpellCustomDictEn()160     SpellCustomDictEn() {
161         delim_ = " _-,./?!%";
162         loadDict("en");
163     }
164 
wordCompare(unsigned int c1,unsigned int c2)165     bool wordCompare(unsigned int c1, unsigned int c2) override {
166         switch (c1) {
167         case_A_Z:
168             c1 += 'a' - 'A';
169             break;
170         case_a_z:
171             break;
172         default:
173             return c1 == c2;
174         }
175         switch (c2) {
176         case_A_Z:
177             c2 += 'a' - 'A';
178             break;
179         }
180         return c1 == c2;
181     }
wordCheck(const std::string & str)182     int wordCheck(const std::string &str) override {
183         if (isFirstCapital(str)) {
184             return CUSTOM_FIRST_CAPITAL;
185         }
186         if (isAllCapital(str)) {
187             return CUSTOM_ALL_CAPITAL;
188         }
189         return CUSTOM_DEFAULT;
190     }
191 
hintComplete(std::vector<std::string> & hints,int type)192     void hintComplete(std::vector<std::string> &hints, int type) override {
193         switch (type) {
194         case CUSTOM_ALL_CAPITAL:
195             for (auto &hint : hints) {
196                 toUpperString(hint);
197             }
198             break;
199         case CUSTOM_FIRST_CAPITAL:
200             for (auto &hint : hints) {
201                 if (hint.empty()) {
202                     continue;
203                 }
204                 switch (hint[0]) {
205                 case_a_z:
206                     hint[0] += 'A' - 'a';
207                     break;
208                 default:
209                     break;
210                 }
211             }
212         default:
213             break;
214         }
215     }
216 };
217 
218 #if 0
219 static inline uint16_t
220 load_le16(const void* p)
221 {
222     return le16toh(*(uint16_t*)p);
223 }
224 #endif
225 
226 /**
227 // Open the dict file, return -1 if failed.
228  **/
locateDictFile(const std::string & lang)229 std::string SpellCustomDict::locateDictFile(const std::string &lang) {
230     auto templatePath = "spell/" + lang + "_dict.fscd";
231     const auto &standardPath = StandardPath::global();
232     std::string path;
233     standardPath.scanDirectories(
234         StandardPath::Type::PkgData,
235         [&path, &templatePath](const std::string &dirPath, bool isUser) {
236             if (isUser) {
237                 return true;
238             }
239             auto fullPath = stringutils::joinPath(dirPath, templatePath);
240             if (fs::isreg(fullPath)) {
241                 path = fullPath;
242                 return false;
243             }
244             return true;
245         });
246     return path;
247 }
248 
loadDict(const std::string & lang)249 void SpellCustomDict::loadDict(const std::string &lang) {
250     auto file = locateDictFile(lang);
251     auto fd = UnixFD::own(open(file.c_str(), O_RDONLY));
252 
253     if (!fd.isValid()) {
254         throw std::runtime_error("failed to open dict file");
255     }
256 
257     do {
258         struct stat stat_buf;
259         size_t total_len;
260         char magic_buff[sizeof(DICT_BIN_MAGIC) - 1];
261         if (fstat(fd.fd(), &stat_buf) == -1 ||
262             static_cast<size_t>(stat_buf.st_size) <=
263                 sizeof(uint32_t) + sizeof(magic_buff)) {
264             break;
265         }
266         if (fs::safeRead(fd.fd(), magic_buff, sizeof(magic_buff)) !=
267             sizeof(magic_buff)) {
268             break;
269         }
270         if (memcmp(DICT_BIN_MAGIC, magic_buff, sizeof(magic_buff)) != 0) {
271             break;
272         }
273         total_len = stat_buf.st_size - sizeof(magic_buff);
274         data_.resize(total_len + 1);
275         if (fs::safeRead(fd.fd(), data_.data(), total_len) !=
276             static_cast<ssize_t>(total_len)) {
277             break;
278         }
279         data_[total_len] = '\0';
280 
281         auto lcount = load_le32(data_.data());
282         words_.resize(lcount);
283 
284         /* save words offset's. */
285         size_t i, j;
286         for (i = sizeof(uint32_t), j = 0; i < total_len && j < lcount; i += 1) {
287             i += sizeof(uint16_t);
288             int l = strlen(data_.data() + i);
289             if (!l) {
290                 continue;
291             }
292             words_[j++] = i;
293             i += l;
294         }
295         if (j < lcount || i < total_len) {
296             break;
297         }
298         return;
299     } while (0);
300 
301     throw std::runtime_error("failed to read dict file");
302 }
303 
requestDict(const std::string & lang)304 SpellCustomDict *SpellCustomDict::requestDict(const std::string &lang) {
305     if (checkLang(lang, "en")) {
306         return new SpellCustomDictEn;
307     }
308     return nullptr;
309 }
310 
checkDict(const std::string & lang)311 bool SpellCustomDict::checkDict(const std::string &lang) {
312     return !locateDictFile(lang).empty();
313 }
314 
getDistance(const char * word,int utf8Len,const char * dict)315 int SpellCustomDict::getDistance(const char *word, int utf8Len,
316                                  const char *dict) {
317 #define REPLACE_WEIGHT 3
318 #define INSERT_WEIGHT 3
319 #define REMOVE_WEIGHT 3
320 #define END_WEIGHT 1
321     /*
322      * three kinds of error, replace, insert and remove
323      * replace means apple vs aplle
324      * insert means apple vs applee
325      * remove means apple vs aple
326      *
327      * each error need to follow a correct match.
328      *
329      * number of "remove error" shoud be no more than "maxremove"
330      * while maxremove equals to (length - 2) / 3
331      *
332      * and the total error number should be no more than "maxdiff"
333      * while maxdiff equales to length / 3.
334      */
335     int replace = 0;
336     int insert = 0;
337     int remove = 0;
338     int diff = 0;
339     int maxdiff;
340     int maxremove;
341     unsigned int cur_word_c;
342     unsigned int cur_dict_c;
343     unsigned int next_word_c;
344     unsigned int next_dict_c;
345     maxdiff = utf8Len / 3;
346     maxremove = (utf8Len - 2) / 3;
347     word = fcitx_utf8_get_char(word, &cur_word_c);
348     dict = fcitx_utf8_get_char(dict, &cur_dict_c);
349     while ((diff = replace + insert + remove) <= maxdiff &&
350            remove <= maxremove) {
351         /*
352          * cur_word_c and cur_dict_c are the current characters
353          * and dict and word are pointing to the next one.
354          */
355         if (!cur_word_c) {
356             return ((replace * REPLACE_WEIGHT + insert * INSERT_WEIGHT +
357                      remove * REMOVE_WEIGHT) +
358                     (cur_dict_c
359                          ? (static_cast<int>(fcitx_utf8_strlen(dict)) + 1) *
360                                END_WEIGHT
361                          : 0));
362         }
363         word = fcitx_utf8_get_char(word, &next_word_c);
364 
365         /* check remove error */
366         if (!cur_dict_c) {
367             if (next_word_c) {
368                 return -1;
369             }
370             remove++;
371             if (diff <= maxdiff && remove <= maxremove) {
372                 return (replace * REPLACE_WEIGHT + insert * INSERT_WEIGHT +
373                         remove * REMOVE_WEIGHT);
374             }
375             return -1;
376         }
377         dict = fcitx_utf8_get_char(dict, &next_dict_c);
378         if (cur_word_c == cur_dict_c || (wordCompare(cur_word_c, cur_dict_c))) {
379             cur_word_c = next_word_c;
380             cur_dict_c = next_dict_c;
381             continue;
382         }
383         if (next_word_c == cur_dict_c ||
384             (next_word_c && wordCompare(next_word_c, cur_dict_c))) {
385             word = fcitx_utf8_get_char(word, &cur_word_c);
386             cur_dict_c = next_dict_c;
387             remove++;
388             continue;
389         }
390 
391         /* check insert error */
392         if (cur_word_c == next_dict_c ||
393             (next_dict_c && wordCompare(cur_word_c, next_dict_c))) {
394             cur_word_c = next_word_c;
395             dict = fcitx_utf8_get_char(dict, &cur_dict_c);
396             insert++;
397             continue;
398         }
399 
400         /* check replace error */
401         if (next_word_c == next_dict_c ||
402             (next_word_c && next_dict_c &&
403              wordCompare(next_word_c, next_dict_c))) {
404             if (next_word_c) {
405                 dict = fcitx_utf8_get_char(dict, &cur_dict_c);
406                 word = fcitx_utf8_get_char(word, &cur_word_c);
407             } else {
408                 cur_word_c = 0;
409                 cur_dict_c = 0;
410             }
411             replace++;
412             continue;
413         }
414         break;
415     }
416     return -1;
417 }
418 
hint(const std::string & str,size_t limit)419 std::vector<std::string> SpellCustomDict::hint(const std::string &str,
420                                                size_t limit) {
421     const char *word = str.c_str();
422     const char *real_word = word;
423     std::vector<std::string> result;
424     std::vector<std::pair<const char *, int>> tops;
425     if (!delim_.empty()) {
426         size_t delta;
427         while (real_word[delta = strcspn(real_word, delim_.c_str())]) {
428             real_word += delta + 1;
429         }
430     }
431     if (!real_word[0]) {
432         return {};
433     }
434     auto word_type = wordCheck(real_word);
435     int word_len = fcitx_utf8_strlen(real_word);
436     auto compare = [](const std::pair<const char *, int> &lhs,
437                       const std::pair<const char *, int> &rhs) {
438         return lhs.second < rhs.second;
439     };
440     for (const auto &wordOffset : words_) {
441         int dist;
442         const char *dictWord = data_.data() + wordOffset;
443         if ((dist = getDistance(real_word, word_len, dictWord)) >= 0) {
444             tops.emplace_back(dictWord, dist);
445             std::push_heap(tops.begin(), tops.end(), compare);
446             if (tops.size() > limit) {
447                 std::pop_heap(tops.begin(), tops.end(), compare);
448                 tops.pop_back();
449             }
450         }
451     }
452 
453     // Or sort heap?..
454     std::sort(tops.begin(), tops.end(), compare);
455 
456     result.reserve(tops.size());
457     for (auto &top : tops) {
458         result.emplace_back(top.first);
459     }
460     hintComplete(result, word_type);
461     return result;
462 }
463 } // namespace fcitx
464