1 /*
2  * HunspellSpellingEngine.cpp
3  *
4  * Copyright (C) 2021 by RStudio, PBC
5  *
6  * Unless you have received this program directly from RStudio pursuant
7  * to the terms of a commercial license agreement with RStudio, then
8  * this program is licensed to you under the terms of version 3 of the
9  * GNU Affero General Public License. This program is distributed WITHOUT
10  * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
11  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
12  * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
13  *
14  */
15 
16 #include <core/spelling/HunspellSpellingEngine.hpp>
17 
18 #include <boost/algorithm/string.hpp>
19 
20 #include <core/Log.hpp>
21 #include <core/FileSerializer.hpp>
22 #include <core/StringUtils.hpp>
23 
24 #include <core/spelling/HunspellDictionaryManager.hpp>
25 
26 #include <shared_core/Error.hpp>
27 #include <shared_core/FilePath.hpp>
28 
// Including the hunspell headers caused compilation errors for Windows 64-bit
// builds. The trouble seemed to be a 'near' macro defined somewhere in the
// mingw toolchain (couldn't find where). Fix this by undefining 'near' right
// before we include hunspell.hxx.
33 #if defined(near)
34 #undef near
35 #endif
36 #include "hunspell/hunspell.hxx"
37 
38 namespace rstudio {
39 namespace core {
40 namespace spelling {
41 
42 namespace {
43 
// Strips the morphological description from a dictionary entry: the first
// tab character and everything following it are dropped. Text that contains
// no tab is left unchanged.
void removeMorphologicalDescription(std::string* pText)
{
   const std::string::size_type descriptionStart = pText->find('\t');
   if (descriptionStart == std::string::npos)
      return;

   pText->erase(descriptionStart);
}
51 
52 // extract the word from the dic_delta line -- remove the
53 // optional affix and then replace escaped / chracters
parseDicDeltaLine(std::string line,std::string * pWord,std::string * pAffix)54 bool parseDicDeltaLine(std::string line,
55                        std::string* pWord,
56                        std::string* pAffix)
57 {
58    // skip empty lines
59    boost::algorithm::trim(line);
60    if (line.empty())
61       return false;
62 
63    // find delimiter
64    std::size_t wordEndPos = line.size();
65    for (std::size_t i = 0; i < line.size(); i++)
66    {
67       if (line[i] == '/' && i > 0 && line[i - 1] != '\\')
68       {
69          wordEndPos = i;
70          break;
71       }
72    }
73 
74    // extract word and escape forward slashes
75    std::string word = line.substr(0, wordEndPos);
76    *pWord = boost::algorithm::replace_all_copy(word, "\\/", "/");
77 
78    // extract affix (if any)
79    if (wordEndPos < line.size() - 1)
80    {
81       *pAffix = line.substr(wordEndPos + 1);
82       removeMorphologicalDescription(pAffix);
83    }
84    else
85    {
86       pAffix->clear();
87       removeMorphologicalDescription(pWord);
88    }
89 
90    return true;
91 }
92 
// Hunspell can add a word together with its affixes when it is given an
// example word from the dictionary that already carries the same affix. The
// Google English .dic_delta files use the hard-coded integer values 6 and 7
// to indicate (respectively) the possessive (M) and possessive/plural (MS)
// affixes, so each of them is mapped here to a word that is marked M or MS
// consistently in the main dictionaries of the 4 English variations. Any
// other affix yields an empty string. Extending affix support to other
// languages would require a similar mapping.
std::string exampleWordForEnglishAffix(const std::string& affix)
{
   // possessive (M)
   if (affix == "6")
      return "Arcadia";

   // possessive or plural (MS)
   if (affix == "7")
      return "beverage";

   // no example word is known for any other affix
   return std::string();
}
110 
111 class SpellChecker : boost::noncopyable
112 {
113 public:
~SpellChecker()114    virtual ~SpellChecker() {}
115    virtual Error checkSpelling(const std::string& word, bool *pCorrect) = 0;
116    virtual Error suggestionList(const std::string& word,
117                                 std::vector<std::string>* pSugs) = 0;
118    virtual Error wordChars(std::wstring* pWordChars) = 0;
119 };
120 
121 class NoSpellChecker : public SpellChecker
122 {
123 public:
checkSpelling(const std::string & word,bool * pCorrect)124    Error checkSpelling(const std::string& word, bool *pCorrect)
125    {
126       *pCorrect = true;
127       return Success();
128    }
129 
suggestionList(const std::string & word,std::vector<std::string> * pSugs)130    Error suggestionList(const std::string& word,
131                         std::vector<std::string>* pSugs)
132    {
133       return Success();
134    }
135 
wordChars(std::wstring * pWordChars)136    Error wordChars(std::wstring *pWordChars)
137    {
138       return Success();
139    }
140 };
141 
142 class HunspellSpellChecker : public SpellChecker
143 {
144 public:
HunspellSpellChecker()145    HunspellSpellChecker()
146    {
147    }
148 
~HunspellSpellChecker()149    virtual ~HunspellSpellChecker()
150    {
151       try
152       {
153          pHunspell_.reset();
154       }
155       catch(...)
156       {
157       }
158    }
159 
initialize(const HunspellDictionary & dictionary,const IconvstrFunction & iconvstrFunc)160    Error initialize(const HunspellDictionary& dictionary,
161                     const IconvstrFunction& iconvstrFunc)
162    {
163       // validate that dictionaries exist
164       if (!dictionary.affPath().exists())
165          return core::fileNotFoundError(dictionary.affPath(), ERROR_LOCATION);
166       if (!dictionary.dicPath().exists())
167          return core::fileNotFoundError(dictionary.dicPath(), ERROR_LOCATION);
168 
169       // convert paths to system encoding before sending to external API
170       std::string systemAffPath = string_utils::utf8ToSystem(
171          dictionary.affPath().getAbsolutePath());
172       std::string systemDicPath = string_utils::utf8ToSystem(
173          dictionary.dicPath().getAbsolutePath());
174 
175       // initialize hunspell, iconvstrFunc_, and encoding_
176       pHunspell_.reset(new Hunspell(systemAffPath.c_str(),
177                                     systemDicPath.c_str()));
178       iconvstrFunc_ = iconvstrFunc;
179       encoding_ = pHunspell_->get_dic_encoding();
180 
181       // add words from dic_delta if available
182       FilePath dicPath = dictionary.dicPath();
183       FilePath dicDeltaPath = dicPath.getParent().completeChildPath(
184          dicPath.getStem() + ".dic_delta");
185       if (dicDeltaPath.exists())
186       {
187          Error error = mergeDicDeltaFile(dicDeltaPath);
188          if (error)
189             LOG_ERROR(error);
190       }
191 
192       // return success
193       return Success();
194    }
195 
wordChars(std::wstring * pWordChars)196    Error wordChars(std::wstring *pWordChars)
197    {
198       int len;
199       unsigned short *pChars = pHunspell_->get_wordchars_utf16(&len);
200 
201       for (int i = 0; i < len; i++)
202          pWordChars->push_back(pChars[i]);
203 
204       return Success();
205    }
206 
207 private:
208 
209    // helpers
copyAndFreeHunspellVector(std::vector<std::string> * pVec,char ** wlst,int len)210    void copyAndFreeHunspellVector(std::vector<std::string>* pVec,
211                                     char **wlst,
212                                     int len)
213    {
214       for (int i=0; i < len; i++)
215       {
216          pVec->push_back(wlst[i]);
217       }
218       pHunspell_->free_list(&wlst, len);
219    }
220 
mergeDicDeltaFile(const FilePath & dicDeltaPath)221    Error mergeDicDeltaFile(const FilePath& dicDeltaPath)
222    {
223       // determine whether we are going to support affixes -- we do this for
224       // english only right now because we can correctly (by inspection) map
225       // the chromium numeric affix indicators (6 and 7) to the right
226       // hunspell example words. it's worth investigating whether we can do
227       // this for other languages as well
228       bool addAffixes = boost::algorithm::starts_with(dicDeltaPath.getStem(),
229                                                       "en_");
230 
231       // read the file and strip the BOM
232       std::string contents;
233       Error error = core::readStringFromFile(dicDeltaPath, &contents);
234       if (error)
235          return error;
236       core::stripBOM(&contents);
237 
238       // split into lines
239       std::vector<std::string> lines;
240       boost::algorithm::split(lines,
241                               contents,
242                               boost::algorithm::is_any_of("\n"));
243 
244       // parse lines for words
245       bool added;
246       std::string word, affix, example;
247       for (const std::string& line : lines)
248       {
249          if (parseDicDeltaLine(line, &word, &affix))
250          {
251             example = exampleWordForEnglishAffix(affix);
252             if (!example.empty() && addAffixes)
253             {
254                Error error = addWordWithAffix(word, example, &added);
255                if (error)
256                   LOG_ERROR(error);
257             }
258             else
259             {
260                Error error = addWord(word, &added);
261                if (error)
262                   LOG_ERROR(error);
263             }
264          }
265       }
266 
267       return Success();
268    }
269 
270 
271 public:
checkSpelling(const std::string & word,bool * pCorrect)272    Error checkSpelling(const std::string& word, bool *pCorrect)
273    {
274       std::string encoded;
275       Error error = iconvstrFunc_(word,"UTF-8",encoding_,false,&encoded);
276       if (error)
277          return error;
278 
279       *pCorrect = pHunspell_->spell(encoded.c_str());
280       return Success();
281    }
282 
suggestionList(const std::string & word,std::vector<std::string> * pSug)283    Error suggestionList(const std::string& word, std::vector<std::string>* pSug)
284    {
285       std::string encoded;
286       Error error = iconvstrFunc_(word,"UTF-8",encoding_,false,&encoded);
287       if (error)
288          return error;
289 
290       char ** wlst;
291       int ns = pHunspell_->suggest(&wlst,encoded.c_str());
292       copyAndFreeHunspellVector(pSug,wlst,ns);
293 
294       for (std::string& sug : *pSug)
295       {
296          error = iconvstrFunc_(sug, encoding_, "UTF-8", true, &sug);
297          if (error)
298             return error;
299       }
300 
301       return Success();
302    }
303 
addWord(const std::string & word,bool * pAdded)304    Error addWord(const std::string& word, bool *pAdded)
305    {
306       std::string encoded;
307       Error error = iconvstrFunc_(word,"UTF-8",encoding_,false,&encoded);
308       if (error)
309          return error;
310 
311       // Following the Hunspell::add method through it's various code paths
312       // it seems the return value is always 0, meaning there's really no
313       // error ever thrown if the method fails.
314       *pAdded = (pHunspell_->add(encoded.c_str()) == 0);
315       return Success();
316    }
317 
addWordWithAffix(const std::string & word,const std::string & example,bool * pAdded)318    Error addWordWithAffix(const std::string& word,
319                           const std::string& example,
320                           bool *pAdded)
321    {
322       std::string wordEncoded;
323       Error error = iconvstrFunc_(word,
324                                   "UTF-8",
325                                   encoding_,
326                                   false,
327                                   &wordEncoded);
328       if (error)
329          return error;
330 
331       std::string exampleEncoded;
332       error = iconvstrFunc_(example,
333                             "UTF-8",
334                             encoding_,
335                             false,
336                             &exampleEncoded);
337       if (error)
338          return error;
339 
340       *pAdded = (pHunspell_->add_with_affix(wordEncoded.c_str(),
341                                             exampleEncoded.c_str()) == 0);
342       return Success();
343    }
344 
345    // Hunspell dictionary files are simple: the first line is an integer
346    // indicating the number of entries (one per line), and each line contains
347    // a word followed by '/' plus modifier flags. Example user.dic:
348    // ----------
349    // 3
350    // lol/S
351    // rofl/S
352    // tl;dr/S
353    // ----------
354    // The '/S' modifier treats 'ROFL','rofl', and 'Rofl' as correct spellings.
addDictionary(const FilePath & dicPath,const std::string & key,bool * pAdded)355    Error addDictionary(const FilePath& dicPath,
356                        const std::string& key,
357                        bool *pAdded)
358    {
359       if (!dicPath.exists())
360          return core::fileNotFoundError(dicPath, ERROR_LOCATION);
361 
362       // Convert path to system encoding before sending to external api
363       std::string systemDicPath = string_utils::utf8ToSystem(dicPath.getAbsolutePath());
364       *pAdded = (pHunspell_->add_dic(systemDicPath.c_str(),key.c_str()) == 0);
365       return Success();
366    }
367 
368 private:
369    boost::scoped_ptr<Hunspell> pHunspell_;
370    IconvstrFunction iconvstrFunc_;
371    std::string encoding_;
372 };
373 
374 } // anonymous namespace
375 
376 struct HunspellSpellingEngine::Impl
377 {
Implrstudio::core::spelling::HunspellSpellingEngine::Impl378    Impl(const std::string& langId,
379         const HunspellDictionaryManager& dictionaryManager,
380         const IconvstrFunction& iconvstrFunction)
381       : currentLangId_(langId),
382         dictManager_(dictionaryManager),
383         iconvstrFunction_(iconvstrFunction)
384    {
385    }
386 
useDictionaryrstudio::core::spelling::HunspellSpellingEngine::Impl387    void useDictionary(const std::string& langId)
388    {
389       if (dictionaryContextChanged(langId))
390          resetDictionaries(langId);
391    }
392 
spellCheckerrstudio::core::spelling::HunspellSpellingEngine::Impl393    SpellChecker& spellChecker()
394    {
395       if (!pSpellChecker_)
396          resetDictionaries(currentLangId_);
397 
398       return *pSpellChecker_;
399    }
400 
401 private:
dictionaryContextChangedrstudio::core::spelling::HunspellSpellingEngine::Impl402    bool dictionaryContextChanged(const std::string& langId)
403    {
404       return(langId != currentLangId_ ||
405              dictManager_.custom().dictionaries() != currentCustomDicts_);
406    }
407 
resetDictionariesrstudio::core::spelling::HunspellSpellingEngine::Impl408    void resetDictionaries(const std::string& langId)
409    {
410       HunspellDictionary dict = dictManager_.dictionaryForLanguageId(langId);
411       if (!dict.empty())
412       {
413          HunspellSpellChecker* pHunspell = new HunspellSpellChecker();
414          pSpellChecker_.reset(pHunspell);
415 
416          Error error = pHunspell->initialize(dict, iconvstrFunction_);
417          if (!error)
418          {
419             currentLangId_ = langId;
420             currentCustomDicts_ = dictManager_.custom().dictionaries();
421             for (const std::string& dict : currentCustomDicts_)
422             {
423                bool added;
424                FilePath dicPath = dictManager_.custom().dictionaryPath(dict);
425                Error error = pHunspell->addDictionary(dicPath,
426                                                       dicPath.getStem(),
427                                                       &added);
428                if (error)
429                   LOG_ERROR(error);
430             }
431          }
432          else
433          {
434             LOG_ERROR(error);
435 
436             pSpellChecker_.reset(new NoSpellChecker());
437          }
438       }
439       else
440       {
441          pSpellChecker_.reset(new NoSpellChecker());
442       }
443    }
444 
445 
446 
447 private:
448    std::string currentLangId_;
449    std::vector<std::string> currentCustomDicts_;
450    HunspellDictionaryManager dictManager_;
451    IconvstrFunction iconvstrFunction_;
452    boost::shared_ptr<SpellChecker> pSpellChecker_;
453 };
454 
455 
HunspellSpellingEngine(const std::string & langId,const HunspellDictionaryManager & dictionaryManager,const IconvstrFunction & iconvstrFunction)456 HunspellSpellingEngine::HunspellSpellingEngine(
457                            const std::string& langId,
458                            const HunspellDictionaryManager& dictionaryManager,
459                            const IconvstrFunction& iconvstrFunction)
460    : pImpl_(new Impl(langId, dictionaryManager, iconvstrFunction))
461 {
462 }
463 
464 
useDictionary(const std::string & langId)465 void HunspellSpellingEngine::useDictionary(const std::string& langId)
466 {
467    pImpl_->useDictionary(langId);
468 }
469 
checkSpelling(const std::string & word,bool * pCorrect)470 Error HunspellSpellingEngine::checkSpelling(const std::string& word,
471                                             bool *pCorrect)
472 {
473    return pImpl_->spellChecker().checkSpelling(word, pCorrect);
474 }
475 
suggestionList(const std::string & word,std::vector<std::string> * pSugs)476 Error HunspellSpellingEngine::suggestionList(const std::string& word,
477                                              std::vector<std::string>* pSugs)
478 {
479    return pImpl_->spellChecker().suggestionList(word, pSugs);
480 }
481 
wordChars(std::wstring * pChars)482 Error HunspellSpellingEngine::wordChars(std::wstring *pChars)
483 {
484    return pImpl_->spellChecker().wordChars(pChars);
485 }
486 
487 } // namespace spelling
488 } // namespace core
489 } // namespace rstudio
490 
491 
492 
493