1 /*  This file is part of the KDE libraries
2     SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net>
3     SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl>
4     SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org>
5 
6     SPDX-License-Identifier: LGPL-2.0-or-later
7 */
8 
9 #include <QCoreApplication>
10 #include <QDataStream>
11 #include <QFile>
12 #include <QLocale>
13 #include <QStandardPaths>
14 
15 #include "core_debug.h"
16 #include "guesslanguage.h"
17 #include "loader_p.h"
18 #include "speller.h"
19 #include "spellerplugin_p.h"
20 #include "tokenizer_p.h"
21 
22 /*
23 All language tags should be valid according to IETF BCP 47, as codified in RFC 4646.
24 ISO 639-1 codes should be used for the language part except for cases where there
25 exists no code, then 639-3 codes should be used. Country codes should only be used
26 in special cases. Scripts can be differentiated by IANA subtags, available here:
27 http://www.iana.org/assignments/language-subtag-registry
28 The script tags correspond to ISO 15924
29 
30 An overview of the best practices concerning language tagging is available here:
31 http://www.w3.org/International/articles/language-tags/Overview.en.php
32 
33 lang tags should use underscores (_) rather than hyphens (-) to separate subsections.
34 
35 EXCEPTIONS:
For cases of known differences from the above tagging scheme and major
spellcheckers such as aspell/hunspell/myspell, the scheme used by the spell checkers
shall be used. All exceptions shall be noted here:
39 
40 BCP        SPELLCHECK
41 az-Latn    az
42 
43 */
44 
45 namespace Sonnet
46 {
/**
 * Private implementation of GuessLanguage.
 *
 * Holds the per-instance limits configured through GuessLanguage::setLimits()
 * and a set of static lookup tables that are shared by every instance and
 * filled lazily (by the constructor and by loadModels()).
 */
class GuessLanguagePrivate
{
public:
    // Initializes the limits and, if they are still empty, the static script/dictionary tables.
    GuessLanguagePrivate();
    // Trigram models read by loadModels():
    //            language       trigram  score
    static QHash<QString, QHash<QString, int>> s_knownModels;

    // Reads s_knownModels from the bundled trigram map resource.
    void loadModels();
    // Returns the scripts that make up a relevant share of the letters in text.
    QList<QChar::Script> findRuns(const QString &text);
    // Returns the trigrams of content, most frequent first.
    QVector<QString> createOrderedModel(const QString &content);
    // Returns how far an ordered sample model is from a known model (smaller is closer).
    int distance(const QVector<QString> &model, const QHash<QString, int> &knownModel);
    // Ranks langs by trigram distance to sample, limited by m_maxItems / m_minConfidence.
    QStringList guessFromTrigrams(const QString &sample, const QStringList &langs);
    // Runs guessFromTrigrams() for the languages of each given script.
    QStringList identify(const QString &sample, const QList<QChar::Script> &scripts);
    // Returns the candidate whose spell checker accepts the most words of sentence.
    QString guessFromDictionaries(const QString &sentence, const QStringList &candidates);

    // Dictionary names reported by the Loader.
    static QSet<QString> s_knownDictionaries;
    // Script -> locale names written in that script for which a dictionary exists.
    static QMultiHash<QChar::Script, QString> s_scriptLanguages;
    // Locale name as parsed by QLocale -> original (non-standard) dictionary name.
    static QMap<QString, QString> s_dictionaryNameMap;

    // Samples shorter than this are not run through the trigram matcher.
    const int MIN_LENGTH;
    // Limits applied by guessFromTrigrams(); set through GuessLanguage::setLimits().
    int m_maxItems;
    double m_minConfidence;
};
70 
// Out-of-class definitions of the static tables declared in GuessLanguagePrivate.
// They start out empty; the constructor and loadModels() fill them on first use.
QHash<QString, QHash<QString, int>> GuessLanguagePrivate::s_knownModels;
QSet<QString> GuessLanguagePrivate::s_knownDictionaries;
QMultiHash<QChar::Script, QString> GuessLanguagePrivate::s_scriptLanguages;
QMap<QString, QString> GuessLanguagePrivate::s_dictionaryNameMap;
75 
getNames(QLocale::Script script)76 QStringList getNames(QLocale::Script script)
77 {
78     QStringList locales;
79     const auto matchingLocales = QLocale::matchingLocales(QLocale::AnyLanguage, script, QLocale::AnyCountry);
80     locales.reserve(matchingLocales.size());
81     for (const QLocale &locale : matchingLocales) {
82         locales << locale.name();
83     }
84     return locales;
85 }
86 
GuessLanguagePrivate()87 GuessLanguagePrivate::GuessLanguagePrivate()
88     : MIN_LENGTH(5)
89     , m_maxItems(1)
90     , m_minConfidence(0)
91 {
92     if (!s_scriptLanguages.isEmpty()) {
93         return;
94     }
95 
96     const QStringList languages = Loader::openLoader()->languages();
97     s_knownDictionaries = QSet<QString>(languages.begin(), languages.end());
98     QSet<QString> dictionaryLanguages;
99     for (const QString &dictName : std::as_const(s_knownDictionaries)) {
100         QString languageName = QLocale(dictName).name();
101         if (languageName.isEmpty()) {
102             qCWarning(SONNET_LOG_CORE) << "Unable to parse name for dictionary" << dictName;
103             continue;
104         }
105         dictionaryLanguages.insert(languageName);
106     }
107 
108     QSet<QString> allLanguages;
109     for (int i = 0; i < int(QChar::ScriptCount); i++) {
110         QChar::Script script = static_cast<QChar::Script>(i);
111         QStringList names;
112         switch (script) {
113         case QChar::Script_Latin:
114             names = getNames(QLocale::LatinScript);
115             break;
116         case QChar::Script_Greek:
117             names = getNames(QLocale::GreekScript);
118             break;
119         case QChar::Script_Cyrillic:
120             names = getNames(QLocale::CyrillicScript);
121             break;
122         case QChar::Script_Armenian:
123             names = getNames(QLocale::ArmenianScript);
124             break;
125         case QChar::Script_Hebrew:
126             names = getNames(QLocale::HebrewScript);
127             break;
128         case QChar::Script_Arabic:
129             names = getNames(QLocale::ArabicScript);
130             break;
131         case QChar::Script_Syriac:
132             names = getNames(QLocale::SyriacScript);
133             break;
134         case QChar::Script_Thaana:
135             names = getNames(QLocale::ThaanaScript);
136             break;
137         case QChar::Script_Devanagari:
138             names = getNames(QLocale::DevanagariScript);
139             break;
140         case QChar::Script_Bengali:
141             names = getNames(QLocale::BengaliScript);
142             break;
143         case QChar::Script_Gurmukhi:
144             names = getNames(QLocale::GurmukhiScript);
145             break;
146         case QChar::Script_Gujarati:
147             names = getNames(QLocale::GujaratiScript);
148             break;
149         case QChar::Script_Oriya:
150             names = getNames(QLocale::OriyaScript);
151             break;
152         case QChar::Script_Tamil:
153             names = getNames(QLocale::TamilScript);
154             break;
155         case QChar::Script_Telugu:
156             names = getNames(QLocale::TeluguScript);
157             break;
158         case QChar::Script_Kannada:
159             names = getNames(QLocale::KannadaScript);
160             break;
161         case QChar::Script_Malayalam:
162             names = getNames(QLocale::MalayalamScript);
163             break;
164         case QChar::Script_Sinhala:
165             names = getNames(QLocale::SinhalaScript);
166             break;
167         case QChar::Script_Thai:
168             names = getNames(QLocale::ThaiScript);
169             break;
170         case QChar::Script_Lao:
171             names = getNames(QLocale::LaoScript);
172             break;
173         case QChar::Script_Tibetan:
174             names = getNames(QLocale::TibetanScript);
175             break;
176         case QChar::Script_Myanmar:
177             names = getNames(QLocale::MyanmarScript);
178             break;
179         case QChar::Script_Georgian:
180             names = getNames(QLocale::GeorgianScript);
181             break;
182         case QChar::Script_Hangul:
183             names = getNames(QLocale::HangulScript);
184             break;
185         case QChar::Script_Ethiopic:
186             names = getNames(QLocale::EthiopicScript);
187             break;
188         case QChar::Script_Cherokee:
189             names = getNames(QLocale::CherokeeScript);
190             break;
191         case QChar::Script_CanadianAboriginal:
192             names = getNames(QLocale::CanadianAboriginalScript);
193             break;
194         case QChar::Script_Ogham:
195             names = getNames(QLocale::OghamScript);
196             break;
197         case QChar::Script_Runic:
198             names = getNames(QLocale::RunicScript);
199             break;
200         case QChar::Script_Khmer:
201             names = getNames(QLocale::KhmerScript);
202             break;
203         case QChar::Script_Mongolian:
204             names = getNames(QLocale::MongolianScript);
205             break;
206         case QChar::Script_Hiragana:
207             names = getNames(QLocale::HiraganaScript);
208             break;
209         case QChar::Script_Katakana:
210             names = getNames(QLocale::KatakanaScript);
211             break;
212         case QChar::Script_Bopomofo:
213             names = getNames(QLocale::BopomofoScript);
214             break;
215         case QChar::Script_Han:
216             names = getNames(QLocale::HanScript);
217             break;
218         case QChar::Script_Yi:
219             names = getNames(QLocale::YiScript);
220             break;
221         case QChar::Script_OldItalic:
222             names = getNames(QLocale::OldItalicScript);
223             break;
224         case QChar::Script_Gothic:
225             names = getNames(QLocale::GothicScript);
226             break;
227         case QChar::Script_Deseret:
228             names = getNames(QLocale::DeseretScript);
229             break;
230         case QChar::Script_Tagalog:
231             names = getNames(QLocale::TagalogScript);
232             break;
233         case QChar::Script_Hanunoo:
234             names = getNames(QLocale::HanunooScript);
235             break;
236         case QChar::Script_Buhid:
237             names = getNames(QLocale::BuhidScript);
238             break;
239         case QChar::Script_Tagbanwa:
240             names = getNames(QLocale::TagbanwaScript);
241             break;
242         case QChar::Script_Coptic:
243             names = getNames(QLocale::CopticScript);
244             break;
245         case QChar::Script_Limbu:
246             names = getNames(QLocale::LimbuScript);
247             break;
248         case QChar::Script_TaiLe:
249             names = getNames(QLocale::TaiLeScript);
250             break;
251         case QChar::Script_LinearB:
252             names = getNames(QLocale::LinearBScript);
253             break;
254         case QChar::Script_Ugaritic:
255             names = getNames(QLocale::UgariticScript);
256             break;
257         case QChar::Script_Shavian:
258             names = getNames(QLocale::ShavianScript);
259             break;
260         case QChar::Script_Osmanya:
261             names = getNames(QLocale::OsmanyaScript);
262             break;
263         case QChar::Script_Cypriot:
264             names = getNames(QLocale::CypriotScript);
265             break;
266         case QChar::Script_Braille:
267             names = getNames(QLocale::BrailleScript);
268             break;
269         case QChar::Script_Buginese:
270             names = getNames(QLocale::BugineseScript);
271             break;
272         case QChar::Script_NewTaiLue:
273             names = getNames(QLocale::NewTaiLueScript);
274             break;
275         case QChar::Script_Glagolitic:
276             names = getNames(QLocale::GlagoliticScript);
277             break;
278         case QChar::Script_Tifinagh:
279             names = getNames(QLocale::TifinaghScript);
280             break;
281         case QChar::Script_SylotiNagri:
282             names = getNames(QLocale::SylotiNagriScript);
283             break;
284         case QChar::Script_OldPersian:
285             names = getNames(QLocale::OldPersianScript);
286             break;
287         case QChar::Script_Kharoshthi:
288             names = getNames(QLocale::KharoshthiScript);
289             break;
290         case QChar::Script_Balinese:
291             names = getNames(QLocale::BalineseScript);
292             break;
293         case QChar::Script_Cuneiform:
294             names = getNames(QLocale::CuneiformScript);
295             break;
296         case QChar::Script_Phoenician:
297             names = getNames(QLocale::PhoenicianScript);
298             break;
299         case QChar::Script_PhagsPa:
300             names = getNames(QLocale::PhagsPaScript);
301             break;
302         case QChar::Script_Nko:
303             names = getNames(QLocale::NkoScript);
304             break;
305         case QChar::Script_Sundanese:
306             names = getNames(QLocale::SundaneseScript);
307             break;
308         case QChar::Script_Lepcha:
309             names = getNames(QLocale::LepchaScript);
310             break;
311         case QChar::Script_OlChiki:
312             names = getNames(QLocale::OlChikiScript);
313             break;
314         case QChar::Script_Vai:
315             names = getNames(QLocale::VaiScript);
316             break;
317         case QChar::Script_Saurashtra:
318             names = getNames(QLocale::SaurashtraScript);
319             break;
320         case QChar::Script_KayahLi:
321             names = getNames(QLocale::KayahLiScript);
322             break;
323         case QChar::Script_Rejang:
324             names = getNames(QLocale::RejangScript);
325             break;
326         case QChar::Script_Lycian:
327             names = getNames(QLocale::LycianScript);
328             break;
329         case QChar::Script_Carian:
330             names = getNames(QLocale::CarianScript);
331             break;
332         case QChar::Script_Lydian:
333             names = getNames(QLocale::LydianScript);
334             break;
335         case QChar::Script_Cham:
336             names = getNames(QLocale::ChamScript);
337             break;
338         case QChar::Script_TaiTham:
339             names = getNames(QLocale::LannaScript);
340             break;
341         case QChar::Script_TaiViet:
342             names = getNames(QLocale::TaiVietScript);
343             break;
344         case QChar::Script_Avestan:
345             names = getNames(QLocale::AvestanScript);
346             break;
347         case QChar::Script_EgyptianHieroglyphs:
348             names = getNames(QLocale::EgyptianHieroglyphsScript);
349             break;
350         case QChar::Script_Samaritan:
351             names = getNames(QLocale::SamaritanScript);
352             break;
353         case QChar::Script_Lisu:
354             names = getNames(QLocale::FraserScript);
355             break;
356         case QChar::Script_Bamum:
357             names = getNames(QLocale::BamumScript);
358             break;
359         case QChar::Script_Javanese:
360             names = getNames(QLocale::JavaneseScript);
361             break;
362         case QChar::Script_MeeteiMayek:
363             names = getNames(QLocale::MeiteiMayekScript);
364             break;
365         case QChar::Script_ImperialAramaic:
366             names = getNames(QLocale::ImperialAramaicScript);
367             break;
368         case QChar::Script_OldSouthArabian:
369             names = getNames(QLocale::OldSouthArabianScript);
370             break;
371         case QChar::Script_InscriptionalParthian:
372             names = getNames(QLocale::InscriptionalParthianScript);
373             break;
374         case QChar::Script_InscriptionalPahlavi:
375             names = getNames(QLocale::InscriptionalPahlaviScript);
376             break;
377         case QChar::Script_Kaithi:
378             names = getNames(QLocale::KaithiScript);
379             break;
380         case QChar::Script_Batak:
381             names = getNames(QLocale::BatakScript);
382             break;
383         case QChar::Script_Brahmi:
384             names = getNames(QLocale::BrahmiScript);
385             break;
386         case QChar::Script_Mandaic:
387             names = getNames(QLocale::MandaeanScript);
388             break;
389         case QChar::Script_Chakma:
390             names = getNames(QLocale::ChakmaScript);
391             break;
392         case QChar::Script_MeroiticCursive:
393         case QChar::Script_MeroiticHieroglyphs:
394             names = getNames(QLocale::MeroiticCursiveScript);
395             names.append(getNames(QLocale::MeroiticScript));
396             break;
397         case QChar::Script_Miao:
398             names = getNames(QLocale::PollardPhoneticScript);
399             break;
400         case QChar::Script_Sharada:
401             names = getNames(QLocale::SharadaScript);
402             break;
403         case QChar::Script_SoraSompeng:
404             names = getNames(QLocale::SoraSompengScript);
405             break;
406         case QChar::Script_Takri:
407             names = getNames(QLocale::TakriScript);
408             break;
409         case QChar::Script_CaucasianAlbanian:
410             names = getNames(QLocale::CaucasianAlbanianScript);
411             break;
412         case QChar::Script_BassaVah:
413             names = getNames(QLocale::BassaVahScript);
414             break;
415         case QChar::Script_Duployan:
416             names = getNames(QLocale::DuployanScript);
417             break;
418         case QChar::Script_Elbasan:
419             names = getNames(QLocale::ElbasanScript);
420             break;
421         case QChar::Script_Grantha:
422             names = getNames(QLocale::GranthaScript);
423             break;
424         case QChar::Script_PahawhHmong:
425             names = getNames(QLocale::PahawhHmongScript);
426             break;
427         case QChar::Script_Khojki:
428             names = getNames(QLocale::KhojkiScript);
429             break;
430         case QChar::Script_LinearA:
431             names = getNames(QLocale::LinearAScript);
432             break;
433         case QChar::Script_Mahajani:
434             names = getNames(QLocale::MahajaniScript);
435             break;
436         case QChar::Script_Manichaean:
437             names = getNames(QLocale::ManichaeanScript);
438             break;
439         case QChar::Script_MendeKikakui:
440             names = getNames(QLocale::MendeKikakuiScript);
441             break;
442         case QChar::Script_Modi:
443             names = getNames(QLocale::ModiScript);
444             break;
445         case QChar::Script_Mro:
446             names = getNames(QLocale::MroScript);
447             break;
448         case QChar::Script_OldNorthArabian:
449             names = getNames(QLocale::OldNorthArabianScript);
450             break;
451         case QChar::Script_Nabataean:
452             names = getNames(QLocale::NabataeanScript);
453             break;
454         case QChar::Script_Palmyrene:
455             names = getNames(QLocale::PalmyreneScript);
456             break;
457         case QChar::Script_PauCinHau:
458             names = getNames(QLocale::PauCinHauScript);
459             break;
460         case QChar::Script_OldPermic:
461             names = getNames(QLocale::OldPermicScript);
462             break;
463         case QChar::Script_PsalterPahlavi:
464             names = getNames(QLocale::PsalterPahlaviScript);
465             break;
466         case QChar::Script_Siddham:
467             names = getNames(QLocale::SiddhamScript);
468             break;
469         case QChar::Script_Khudawadi:
470             names = getNames(QLocale::KhudawadiScript);
471             break;
472         case QChar::Script_Tirhuta:
473             names = getNames(QLocale::TirhutaScript);
474             break;
475         case QChar::Script_WarangCiti:
476             names = getNames(QLocale::VarangKshitiScript);
477             break;
478         case QChar::Script_Ahom:
479             names = getNames(QLocale::AhomScript);
480             break;
481         case QChar::Script_AnatolianHieroglyphs:
482             names = getNames(QLocale::AnatolianHieroglyphsScript);
483             break;
484         case QChar::Script_Hatran:
485             names = getNames(QLocale::HatranScript);
486             break;
487         case QChar::Script_Multani:
488             names = getNames(QLocale::MultaniScript);
489             break;
490         case QChar::Script_OldHungarian:
491             names = getNames(QLocale::OldHungarianScript);
492             break;
493         case QChar::Script_Unknown:
494         case QChar::Script_Inherited:
495         case QChar::Script_Common:
496         case QChar::Script_OldTurkic:
497         case QChar::Script_SignWriting:
498             break;
499         default:
500             qCDebug(SONNET_LOG_CORE) << "Unhandled script" << script;
501             break;
502         }
503         allLanguages.unite(QSet<QString>(names.constBegin(), names.constEnd()));
504 
505         { // Remove unknown languages
506             QStringList pruned;
507             for (const QString &name : std::as_const(names)) {
508                 if (!dictionaryLanguages.contains(name)) {
509                     continue;
510                 }
511                 pruned.append(name);
512             }
513             names = pruned;
514         }
515 
516         if (names.isEmpty()) {
517             continue;
518         }
519 
520         for (const QString &name : std::as_const(names)) {
521             s_scriptLanguages.insert(script, name);
522         }
523     }
524 
525     // Try to handle some badly named dictionaries
526     if (!allLanguages.contains(s_knownDictionaries)) {
527         QSet<QString> dicts(s_knownDictionaries);
528         dicts.subtract(allLanguages);
529         for (const QString &dictName : std::as_const(dicts)) {
530             QString languageName = QLocale(dictName).name();
531             if (languageName.isEmpty()) {
532                 qCWarning(SONNET_LOG_CORE) << "Unable to parse language name" << dictName;
533                 continue;
534             }
535             s_dictionaryNameMap[languageName] = dictName;
536             if (std::find(s_scriptLanguages.cbegin(), s_scriptLanguages.cend(), languageName) == s_scriptLanguages.cend()) {
537                 qCWarning(SONNET_LOG_CORE) << "Unable to handle language from dictionary" << dictName << languageName;
538             }
539         }
540     }
541 }
542 
GuessLanguage()543 GuessLanguage::GuessLanguage()
544     : d(new GuessLanguagePrivate)
545 {
546 }
547 
/**
 * Destroys the guesser and frees the private object allocated by the constructor.
 */
GuessLanguage::~GuessLanguage()
{
    delete d;
}
552 
identify(const QString & text,const QStringList & suggestionsListIn) const553 QString GuessLanguage::identify(const QString &text, const QStringList &suggestionsListIn) const
554 {
555     if (text.isEmpty()) {
556         return QString();
557     }
558 
559     // Filter for available dictionaries
560     QStringList suggestionsList;
561     for (const QString &suggestion : suggestionsListIn) {
562         if (d->s_knownDictionaries.contains(suggestion) && !suggestionsList.contains(suggestion)) {
563             suggestionsList.append(suggestion);
564         }
565     }
566 
567     // Load the model on demand
568     if (d->s_knownModels.isEmpty()) {
569         d->loadModels();
570     }
571 
572     const QList<QChar::Script> scriptsList = d->findRuns(text);
573 
574     QStringList candidateLanguages = d->identify(text, scriptsList);
575 
576     // if guessing from trigrams fail
577     for (const QChar::Script script : scriptsList) {
578         const auto languagesList = d->s_scriptLanguages.values(script);
579         for (const QString &lang : languagesList) {
580             if (!d->s_knownModels.contains(lang)) {
581                 candidateLanguages.append(lang);
582             }
583         }
584     }
585 
586     // Hack for some bad dictionary names
587     for (int i = 0; i < candidateLanguages.count(); i++) {
588         if (d->s_dictionaryNameMap.contains(candidateLanguages[i])) {
589             candidateLanguages[i] = d->s_dictionaryNameMap.value(candidateLanguages[i]);
590         }
591     }
592 
593     if (candidateLanguages.count() == 1) {
594         return candidateLanguages.first();
595     }
596 
597     // Wasn't able to get a good guess with the trigrams, try checking all
598     // dictionaries for the suggested languages.
599     candidateLanguages.append(suggestionsList);
600     candidateLanguages.removeDuplicates();
601     QString identified = d->guessFromDictionaries(text, candidateLanguages);
602     if (!identified.isEmpty()) {
603         return identified;
604     }
605 
606     qCDebug(SONNET_LOG_CORE()) << "Unable to identify string with dictionaries:" << text;
607 
608     // None of our methods worked, just return the best suggestion
609     if (!suggestionsList.isEmpty()) {
610         return suggestionsList.first();
611     }
612 
613     qCDebug(SONNET_LOG_CORE) << "Unable to find any suggestion for" << text;
614 
615     // Not even any suggestions, give up
616     return QString();
617 }
618 
setLimits(int maxItems,double minConfidence)619 void GuessLanguage::setLimits(int maxItems, double minConfidence)
620 {
621     d->m_maxItems = maxItems;
622     d->m_minConfidence = minConfidence;
623 }
624 
loadModels()625 void GuessLanguagePrivate::loadModels()
626 {
627     // use trigrams from resource file, easy to deploy on all platforms
628     const QString triMapFile = QStringLiteral(":/org.kde.sonnet/trigrams.map");
629     qCDebug(SONNET_LOG_CORE) << "Loading trigrams from" << triMapFile;
630 
631     QFile sin(triMapFile);
632     if (!sin.open(QIODevice::ReadOnly)) {
633         qCWarning(SONNET_LOG_CORE) << "Sonnet: Unable to load trigram models from file" << triMapFile;
634         return;
635     }
636 
637     QDataStream in(&sin);
638     in >> s_knownModels;
639 
640     // Sanity check
641     QSet<QString> availableLanguages;
642     QHashIterator<QString, QHash<QString, int>> iterator(s_knownModels);
643     while (iterator.hasNext()) {
644         iterator.next();
645         if (iterator.value().count() < MAXGRAMS) {
646             qCWarning(SONNET_LOG_CORE) << iterator.key() << "is has only" << iterator.value().count() << "trigrams, expected" << MAXGRAMS;
647         }
648         availableLanguages.insert(iterator.key());
649     }
650     QSet<QString> knownLanguages(s_scriptLanguages.constBegin(), s_scriptLanguages.constEnd());
651     knownLanguages.subtract(availableLanguages);
652     if (!knownLanguages.isEmpty()) {
653         qCDebug(SONNET_LOG_CORE) << "Missing trigrams for languages:" << knownLanguages;
654     }
655 }
656 
findRuns(const QString & text)657 QList<QChar::Script> GuessLanguagePrivate::findRuns(const QString &text)
658 {
659     QHash<QChar::Script, int> scriptCounts;
660 
661     int totalCount = 0;
662 
663     for (const QChar c : text) {
664         const QChar::Script script = c.script();
665 
666         if (script == QChar::Script_Common || script == QChar::Script_Inherited) {
667             continue;
668         }
669 
670         if (!c.isLetter()) {
671             continue;
672         }
673 
674         scriptCounts[script]++;
675         totalCount++;
676     }
677 
678     QList<QChar::Script> relevantScripts;
679 
680     if (totalCount == 0) {
681         return relevantScripts;
682     }
683 
684     if (scriptCounts.size() == 1) {
685         return {scriptCounts.cbegin().key()};
686     }
687 
688     for (auto it = scriptCounts.cbegin(); it != scriptCounts.cend(); ++it) {
689         // return run types that used for 40% or more of the string
690         const int scriptCount = it.value();
691         const auto currentScript = it.key();
692         if (scriptCount * 100 / totalCount >= 40) {
693             relevantScripts << currentScript;
694             // always return basic latin if found more than 15%.
695         } else if (currentScript == QChar::Script_Latin && scriptCount * 100 / totalCount >= 15) {
696             relevantScripts << currentScript;
697         }
698     }
699 
700     return relevantScripts;
701 }
702 
identify(const QString & sample,const QList<QChar::Script> & scripts)703 QStringList GuessLanguagePrivate::identify(const QString &sample, const QList<QChar::Script> &scripts)
704 {
705     if (sample.size() < MIN_LENGTH) {
706         return QStringList();
707     }
708 
709     QStringList guesses;
710     for (const QChar::Script script : scripts) {
711         guesses.append(guessFromTrigrams(sample, s_scriptLanguages.values(script)));
712     }
713 
714     return guesses;
715 }
716 
/**
 * Ranks the given languages by how well their trigram model matches a sample.
 *
 * Every language that has a model in s_knownModels is scored with distance()
 * (lower is better). Starting with the best one, languages are added to the
 * result until m_maxItems languages were added or the accumulated confidence
 * reaches m_minConfidence. The confidence gained by a language is the relative
 * gap between its score and the score of the next language.
 *
 * @param sample    the text to identify
 * @param languages the languages to consider
 * @return the selected languages, best match first; empty if no language has a
 *         model or the best score is as bad as a completely unknown text
 */
QStringList GuessLanguagePrivate::guessFromTrigrams(const QString &sample, const QStringList &languages)
{
    QStringList ret;

    const QVector<QString> sampleTrigrams = createOrderedModel(sample);

    // Sort by score
    QMultiMap<int, QString> scores;
    for (const QString &language : languages) {
        // constFind() does a single lookup and cannot detach the shared static hash
        const auto model = s_knownModels.constFind(language);
        if (model != s_knownModels.constEnd()) {
            scores.insert(distance(sampleTrigrams, model.value()), language);
        }
    }

    // Skip if either no results or best result is completely unknown (distance >= maxdistance)
    if (scores.isEmpty() || scores.firstKey() >= MAXGRAMS * sampleTrigrams.size()) {
        qCDebug(SONNET_LOG_CORE) << "No scores for" << sample;
        return ret;
    }

    int counter = 0;
    double confidence = 0;

    // STL-style iterators work for QMultiMap in both Qt 5 and Qt 6, whereas
    // QMapIterator only accepts a QMultiMap in Qt 5.
    auto it = scores.constBegin();
    const auto end = scores.constEnd();

    QString prevItem = it.value();
    int prevScore = it.key();

    for (++it; it != end && counter < m_maxItems && confidence < m_minConfidence; ++it) {
        counter++;
        // Scores are sorted ascending, so a score of 0 implies prevScore is 0 as
        // well: there is no gap, and dividing would produce NaN.
        if (it.key() > 0) {
            confidence += (it.key() - prevScore) / static_cast<double>(it.key());
        }
        ret += prevItem;
        prevItem = it.value();
        prevScore = it.key();
    }
    // The last visited language is still pending; add it if the limits allow
    if (counter < m_maxItems && confidence < m_minConfidence) {
        ret += prevItem;
    }

    return ret;
}
759 
createOrderedModel(const QString & content)760 QVector<QString> GuessLanguagePrivate::createOrderedModel(const QString &content)
761 {
762     QHash<QString, int> trigramCounts;
763 
764     // collect trigrams
765     trigramCounts.reserve(content.size() - 2);
766     for (int i = 0; i < (content.size() - 2); ++i) {
767         QString tri = content.mid(i, 3).toLower();
768         trigramCounts[tri]++;
769     }
770 
771     // invert the map <freq, trigram>
772     QVector<QPair<int, QString>> trigramFrequencyList;
773     trigramFrequencyList.reserve(trigramCounts.size());
774 
775     auto it = trigramCounts.constBegin();
776     for (; it != trigramCounts.constEnd(); ++it) {
777         const QChar *data = it.key().constData();
778         bool hasTwoSpaces = (data[1].isSpace() && (data[0].isSpace() || data[2].isSpace()));
779 
780         if (!hasTwoSpaces) {
781             const int freq = it.value();
782             const QString &trigram = it.key();
783             trigramFrequencyList.append({freq, trigram});
784         }
785     }
786 
787     // sort descending by frequency
788     std::sort(trigramFrequencyList.begin(), trigramFrequencyList.end(), [](const QPair<int, QString> &a, const QPair<int, QString> &b) {
789         return a.first > b.first;
790     });
791 
792     QVector<QString> orderedTrigrams;
793     orderedTrigrams.reserve(trigramFrequencyList.size());
794     for (const auto &tri : std::as_const(trigramFrequencyList)) {
795         orderedTrigrams.append(tri.second);
796     }
797 
798     return orderedTrigrams;
799 }
800 
distance(const QVector<QString> & model,const QHash<QString,int> & knownModel)801 int GuessLanguagePrivate::distance(const QVector<QString> &model, const QHash<QString, int> &knownModel)
802 {
803     int counter = -1;
804     int dist = 0;
805 
806     for (const QString &trigram : model) {
807         const int val = knownModel.value(trigram, -1);
808         if (val != -1) {
809             dist += qAbs(++counter - val);
810         } else {
811             dist += MAXGRAMS;
812         }
813 
814         if (counter == (MAXGRAMS - 1)) {
815             break;
816         }
817     }
818 
819     return dist;
820 }
821 
/**
 * Picks a language by spell checking the sentence with several dictionaries.
 *
 * A speller is obtained from the Loader for every candidate it knows. Each
 * spell-checkable word of the sentence is then checked with every speller,
 * and the language whose speller accepted the most words wins. On a tie the
 * language that sorts first is returned.
 *
 * @param sentence   the text to check
 * @param candidates the languages to try
 * @return the winning language, or an empty string if no speller is available
 *         or no speller accepted any word
 */
QString GuessLanguagePrivate::guessFromDictionaries(const QString &sentence, const QStringList &candidates)
{
    // Fetch the loader and its language list once instead of once per candidate
    const auto loader = Loader::openLoader();
    const QStringList availableLanguages = loader->languages();

    // Try to see how many languages we can get spell checking for
    QList<QSharedPointer<SpellerPlugin>> spellers;
    for (const QString &lang : candidates) {
        if (!availableLanguages.contains(lang)) {
            qCWarning(SONNET_LOG_CORE) << "Dictionary asked for invalid speller" << lang;
            continue;
        }
        QSharedPointer<SpellerPlugin> plugin = loader->cachedSpeller(lang);
        if (!plugin.isNull()) {
            spellers.append(plugin);
        }
    }

    // If there's no spell checkers, give up
    if (spellers.isEmpty()) {
        return QString();
    }

    QMap<QString, int> correctHits;

    WordTokenizer tokenizer(sentence);
    while (tokenizer.hasNext()) {
        Token word = tokenizer.next();
        if (!tokenizer.isSpellcheckable()) {
            continue;
        }

        // Convert the token once, not once per speller
        const QString wordText = word.toString();
        for (const auto &speller : std::as_const(spellers)) {
            if (speller->isCorrect(wordText)) {
                correctHits[speller->language()]++;
            }
        }
    }

    if (correctHits.isEmpty()) {
        return QString();
    }

    // Iterating a QMap yields its values, so this finds the first entry with
    // the highest hit count.
    const auto best = std::max_element(correctHits.constBegin(), correctHits.constEnd());
    return best.key();
}
870 }
871