1 /*
2 * HunspellSpellingEngine.cpp
3 *
4 * Copyright (C) 2021 by RStudio, PBC
5 *
6 * Unless you have received this program directly from RStudio pursuant
7 * to the terms of a commercial license agreement with RStudio, then
8 * this program is licensed to you under the terms of version 3 of the
9 * GNU Affero General Public License. This program is distributed WITHOUT
10 * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
12 * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
13 *
14 */
15
16 #include <core/spelling/HunspellSpellingEngine.hpp>
17
18 #include <boost/algorithm/string.hpp>
19
20 #include <core/Log.hpp>
21 #include <core/FileSerializer.hpp>
22 #include <core/StringUtils.hpp>
23
24 #include <core/spelling/HunspellDictionaryManager.hpp>
25
26 #include <shared_core/Error.hpp>
27 #include <shared_core/FilePath.hpp>
28
// Including the hunspell headers caused compilation errors for Windows 64-bit
// builds. The trouble seemed to be a 'near' macro defined somewhere in the
// mingw toolchain (couldn't find where). Fix this by undefining 'near' right
// before we include hunspell.hxx.
33 #if defined(near)
34 #undef near
35 #endif
36 #include "hunspell/hunspell.hxx"
37
38 namespace rstudio {
39 namespace core {
40 namespace spelling {
41
42 namespace {
43
44 // remove morphological description from text
void removeMorphologicalDescription(std::string* pText)
{
   // the morphological description (if present) starts at the first tab;
   // text without a tab is left exactly as it was
   const std::string::size_type cut = pText->find('\t');
   if (cut == std::string::npos)
      return;

   // drop the tab and everything that follows it
   pText->erase(cut);
}
51
// extract the word from the dic_delta line -- remove the
// optional affix and then replace escaped / characters
parseDicDeltaLine(std::string line,std::string * pWord,std::string * pAffix)54 bool parseDicDeltaLine(std::string line,
55 std::string* pWord,
56 std::string* pAffix)
57 {
58 // skip empty lines
59 boost::algorithm::trim(line);
60 if (line.empty())
61 return false;
62
63 // find delimiter
64 std::size_t wordEndPos = line.size();
65 for (std::size_t i = 0; i < line.size(); i++)
66 {
67 if (line[i] == '/' && i > 0 && line[i - 1] != '\\')
68 {
69 wordEndPos = i;
70 break;
71 }
72 }
73
74 // extract word and escape forward slashes
75 std::string word = line.substr(0, wordEndPos);
76 *pWord = boost::algorithm::replace_all_copy(word, "\\/", "/");
77
78 // extract affix (if any)
79 if (wordEndPos < line.size() - 1)
80 {
81 *pAffix = line.substr(wordEndPos + 1);
82 removeMorphologicalDescription(pAffix);
83 }
84 else
85 {
86 pAffix->clear();
87 removeMorphologicalDescription(pWord);
88 }
89
90 return true;
91 }
92
// The hunspell api allows you to add words with affixes by providing an
// example word already in the dictionary that has the same affix. The google
// english .dic_delta files use the hard-coded integer values 6 and 7 to
// (respectively) indicate possessive (M) and possessive/plural (MS) affixes.
// Therefore, this function needs to return words that are marked as
// M or MS consistently in the main dictionaries of the 4 english variations.
// If we want to extend affix support to other languages we'll need to
// do a similar mapping.
std::string exampleWordForEnglishAffix(const std::string& affix)
{
   // "6" is the possessive (M) indicator
   if (affix == "6")
      return "Arcadia";

   // "7" is the possessive or plural (MS) indicator
   if (affix == "7")
      return "beverage";

   // any other indicator has no example word: signal that with an empty string
   return std::string();
}
110
111 class SpellChecker : boost::noncopyable
112 {
113 public:
~SpellChecker()114 virtual ~SpellChecker() {}
115 virtual Error checkSpelling(const std::string& word, bool *pCorrect) = 0;
116 virtual Error suggestionList(const std::string& word,
117 std::vector<std::string>* pSugs) = 0;
118 virtual Error wordChars(std::wstring* pWordChars) = 0;
119 };
120
121 class NoSpellChecker : public SpellChecker
122 {
123 public:
checkSpelling(const std::string & word,bool * pCorrect)124 Error checkSpelling(const std::string& word, bool *pCorrect)
125 {
126 *pCorrect = true;
127 return Success();
128 }
129
suggestionList(const std::string & word,std::vector<std::string> * pSugs)130 Error suggestionList(const std::string& word,
131 std::vector<std::string>* pSugs)
132 {
133 return Success();
134 }
135
wordChars(std::wstring * pWordChars)136 Error wordChars(std::wstring *pWordChars)
137 {
138 return Success();
139 }
140 };
141
142 class HunspellSpellChecker : public SpellChecker
143 {
144 public:
HunspellSpellChecker()145 HunspellSpellChecker()
146 {
147 }
148
~HunspellSpellChecker()149 virtual ~HunspellSpellChecker()
150 {
151 try
152 {
153 pHunspell_.reset();
154 }
155 catch(...)
156 {
157 }
158 }
159
initialize(const HunspellDictionary & dictionary,const IconvstrFunction & iconvstrFunc)160 Error initialize(const HunspellDictionary& dictionary,
161 const IconvstrFunction& iconvstrFunc)
162 {
163 // validate that dictionaries exist
164 if (!dictionary.affPath().exists())
165 return core::fileNotFoundError(dictionary.affPath(), ERROR_LOCATION);
166 if (!dictionary.dicPath().exists())
167 return core::fileNotFoundError(dictionary.dicPath(), ERROR_LOCATION);
168
169 // convert paths to system encoding before sending to external API
170 std::string systemAffPath = string_utils::utf8ToSystem(
171 dictionary.affPath().getAbsolutePath());
172 std::string systemDicPath = string_utils::utf8ToSystem(
173 dictionary.dicPath().getAbsolutePath());
174
175 // initialize hunspell, iconvstrFunc_, and encoding_
176 pHunspell_.reset(new Hunspell(systemAffPath.c_str(),
177 systemDicPath.c_str()));
178 iconvstrFunc_ = iconvstrFunc;
179 encoding_ = pHunspell_->get_dic_encoding();
180
181 // add words from dic_delta if available
182 FilePath dicPath = dictionary.dicPath();
183 FilePath dicDeltaPath = dicPath.getParent().completeChildPath(
184 dicPath.getStem() + ".dic_delta");
185 if (dicDeltaPath.exists())
186 {
187 Error error = mergeDicDeltaFile(dicDeltaPath);
188 if (error)
189 LOG_ERROR(error);
190 }
191
192 // return success
193 return Success();
194 }
195
wordChars(std::wstring * pWordChars)196 Error wordChars(std::wstring *pWordChars)
197 {
198 int len;
199 unsigned short *pChars = pHunspell_->get_wordchars_utf16(&len);
200
201 for (int i = 0; i < len; i++)
202 pWordChars->push_back(pChars[i]);
203
204 return Success();
205 }
206
207 private:
208
209 // helpers
copyAndFreeHunspellVector(std::vector<std::string> * pVec,char ** wlst,int len)210 void copyAndFreeHunspellVector(std::vector<std::string>* pVec,
211 char **wlst,
212 int len)
213 {
214 for (int i=0; i < len; i++)
215 {
216 pVec->push_back(wlst[i]);
217 }
218 pHunspell_->free_list(&wlst, len);
219 }
220
mergeDicDeltaFile(const FilePath & dicDeltaPath)221 Error mergeDicDeltaFile(const FilePath& dicDeltaPath)
222 {
223 // determine whether we are going to support affixes -- we do this for
224 // english only right now because we can correctly (by inspection) map
225 // the chromium numeric affix indicators (6 and 7) to the right
226 // hunspell example words. it's worth investigating whether we can do
227 // this for other languages as well
228 bool addAffixes = boost::algorithm::starts_with(dicDeltaPath.getStem(),
229 "en_");
230
231 // read the file and strip the BOM
232 std::string contents;
233 Error error = core::readStringFromFile(dicDeltaPath, &contents);
234 if (error)
235 return error;
236 core::stripBOM(&contents);
237
238 // split into lines
239 std::vector<std::string> lines;
240 boost::algorithm::split(lines,
241 contents,
242 boost::algorithm::is_any_of("\n"));
243
244 // parse lines for words
245 bool added;
246 std::string word, affix, example;
247 for (const std::string& line : lines)
248 {
249 if (parseDicDeltaLine(line, &word, &affix))
250 {
251 example = exampleWordForEnglishAffix(affix);
252 if (!example.empty() && addAffixes)
253 {
254 Error error = addWordWithAffix(word, example, &added);
255 if (error)
256 LOG_ERROR(error);
257 }
258 else
259 {
260 Error error = addWord(word, &added);
261 if (error)
262 LOG_ERROR(error);
263 }
264 }
265 }
266
267 return Success();
268 }
269
270
271 public:
checkSpelling(const std::string & word,bool * pCorrect)272 Error checkSpelling(const std::string& word, bool *pCorrect)
273 {
274 std::string encoded;
275 Error error = iconvstrFunc_(word,"UTF-8",encoding_,false,&encoded);
276 if (error)
277 return error;
278
279 *pCorrect = pHunspell_->spell(encoded.c_str());
280 return Success();
281 }
282
suggestionList(const std::string & word,std::vector<std::string> * pSug)283 Error suggestionList(const std::string& word, std::vector<std::string>* pSug)
284 {
285 std::string encoded;
286 Error error = iconvstrFunc_(word,"UTF-8",encoding_,false,&encoded);
287 if (error)
288 return error;
289
290 char ** wlst;
291 int ns = pHunspell_->suggest(&wlst,encoded.c_str());
292 copyAndFreeHunspellVector(pSug,wlst,ns);
293
294 for (std::string& sug : *pSug)
295 {
296 error = iconvstrFunc_(sug, encoding_, "UTF-8", true, &sug);
297 if (error)
298 return error;
299 }
300
301 return Success();
302 }
303
addWord(const std::string & word,bool * pAdded)304 Error addWord(const std::string& word, bool *pAdded)
305 {
306 std::string encoded;
307 Error error = iconvstrFunc_(word,"UTF-8",encoding_,false,&encoded);
308 if (error)
309 return error;
310
311 // Following the Hunspell::add method through it's various code paths
312 // it seems the return value is always 0, meaning there's really no
313 // error ever thrown if the method fails.
314 *pAdded = (pHunspell_->add(encoded.c_str()) == 0);
315 return Success();
316 }
317
addWordWithAffix(const std::string & word,const std::string & example,bool * pAdded)318 Error addWordWithAffix(const std::string& word,
319 const std::string& example,
320 bool *pAdded)
321 {
322 std::string wordEncoded;
323 Error error = iconvstrFunc_(word,
324 "UTF-8",
325 encoding_,
326 false,
327 &wordEncoded);
328 if (error)
329 return error;
330
331 std::string exampleEncoded;
332 error = iconvstrFunc_(example,
333 "UTF-8",
334 encoding_,
335 false,
336 &exampleEncoded);
337 if (error)
338 return error;
339
340 *pAdded = (pHunspell_->add_with_affix(wordEncoded.c_str(),
341 exampleEncoded.c_str()) == 0);
342 return Success();
343 }
344
345 // Hunspell dictionary files are simple: the first line is an integer
346 // indicating the number of entries (one per line), and each line contains
347 // a word followed by '/' plus modifier flags. Example user.dic:
348 // ----------
349 // 3
350 // lol/S
351 // rofl/S
352 // tl;dr/S
353 // ----------
354 // The '/S' modifier treats 'ROFL','rofl', and 'Rofl' as correct spellings.
addDictionary(const FilePath & dicPath,const std::string & key,bool * pAdded)355 Error addDictionary(const FilePath& dicPath,
356 const std::string& key,
357 bool *pAdded)
358 {
359 if (!dicPath.exists())
360 return core::fileNotFoundError(dicPath, ERROR_LOCATION);
361
362 // Convert path to system encoding before sending to external api
363 std::string systemDicPath = string_utils::utf8ToSystem(dicPath.getAbsolutePath());
364 *pAdded = (pHunspell_->add_dic(systemDicPath.c_str(),key.c_str()) == 0);
365 return Success();
366 }
367
368 private:
369 boost::scoped_ptr<Hunspell> pHunspell_;
370 IconvstrFunction iconvstrFunc_;
371 std::string encoding_;
372 };
373
374 } // anonymous namespace
375
376 struct HunspellSpellingEngine::Impl
377 {
Implrstudio::core::spelling::HunspellSpellingEngine::Impl378 Impl(const std::string& langId,
379 const HunspellDictionaryManager& dictionaryManager,
380 const IconvstrFunction& iconvstrFunction)
381 : currentLangId_(langId),
382 dictManager_(dictionaryManager),
383 iconvstrFunction_(iconvstrFunction)
384 {
385 }
386
useDictionaryrstudio::core::spelling::HunspellSpellingEngine::Impl387 void useDictionary(const std::string& langId)
388 {
389 if (dictionaryContextChanged(langId))
390 resetDictionaries(langId);
391 }
392
spellCheckerrstudio::core::spelling::HunspellSpellingEngine::Impl393 SpellChecker& spellChecker()
394 {
395 if (!pSpellChecker_)
396 resetDictionaries(currentLangId_);
397
398 return *pSpellChecker_;
399 }
400
401 private:
dictionaryContextChangedrstudio::core::spelling::HunspellSpellingEngine::Impl402 bool dictionaryContextChanged(const std::string& langId)
403 {
404 return(langId != currentLangId_ ||
405 dictManager_.custom().dictionaries() != currentCustomDicts_);
406 }
407
resetDictionariesrstudio::core::spelling::HunspellSpellingEngine::Impl408 void resetDictionaries(const std::string& langId)
409 {
410 HunspellDictionary dict = dictManager_.dictionaryForLanguageId(langId);
411 if (!dict.empty())
412 {
413 HunspellSpellChecker* pHunspell = new HunspellSpellChecker();
414 pSpellChecker_.reset(pHunspell);
415
416 Error error = pHunspell->initialize(dict, iconvstrFunction_);
417 if (!error)
418 {
419 currentLangId_ = langId;
420 currentCustomDicts_ = dictManager_.custom().dictionaries();
421 for (const std::string& dict : currentCustomDicts_)
422 {
423 bool added;
424 FilePath dicPath = dictManager_.custom().dictionaryPath(dict);
425 Error error = pHunspell->addDictionary(dicPath,
426 dicPath.getStem(),
427 &added);
428 if (error)
429 LOG_ERROR(error);
430 }
431 }
432 else
433 {
434 LOG_ERROR(error);
435
436 pSpellChecker_.reset(new NoSpellChecker());
437 }
438 }
439 else
440 {
441 pSpellChecker_.reset(new NoSpellChecker());
442 }
443 }
444
445
446
447 private:
448 std::string currentLangId_;
449 std::vector<std::string> currentCustomDicts_;
450 HunspellDictionaryManager dictManager_;
451 IconvstrFunction iconvstrFunction_;
452 boost::shared_ptr<SpellChecker> pSpellChecker_;
453 };
454
455
HunspellSpellingEngine(const std::string & langId,const HunspellDictionaryManager & dictionaryManager,const IconvstrFunction & iconvstrFunction)456 HunspellSpellingEngine::HunspellSpellingEngine(
457 const std::string& langId,
458 const HunspellDictionaryManager& dictionaryManager,
459 const IconvstrFunction& iconvstrFunction)
460 : pImpl_(new Impl(langId, dictionaryManager, iconvstrFunction))
461 {
462 }
463
464
useDictionary(const std::string & langId)465 void HunspellSpellingEngine::useDictionary(const std::string& langId)
466 {
467 pImpl_->useDictionary(langId);
468 }
469
checkSpelling(const std::string & word,bool * pCorrect)470 Error HunspellSpellingEngine::checkSpelling(const std::string& word,
471 bool *pCorrect)
472 {
473 return pImpl_->spellChecker().checkSpelling(word, pCorrect);
474 }
475
suggestionList(const std::string & word,std::vector<std::string> * pSugs)476 Error HunspellSpellingEngine::suggestionList(const std::string& word,
477 std::vector<std::string>* pSugs)
478 {
479 return pImpl_->spellChecker().suggestionList(word, pSugs);
480 }
481
wordChars(std::wstring * pChars)482 Error HunspellSpellingEngine::wordChars(std::wstring *pChars)
483 {
484 return pImpl_->spellChecker().wordChars(pChars);
485 }
486
487 } // namespace spelling
488 } // namespace core
489 } // namespace rstudio
490
491
492
493