1 /* This file is part of the KDE libraries
2 SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net>
3 SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl>
4 SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org>
5
6 SPDX-License-Identifier: LGPL-2.0-or-later
7 */
8
9 #include <QCoreApplication>
10 #include <QDataStream>
11 #include <QFile>
12 #include <QLocale>
13 #include <QStandardPaths>
14
15 #include "core_debug.h"
16 #include "guesslanguage.h"
17 #include "loader_p.h"
18 #include "speller.h"
19 #include "spellerplugin_p.h"
20 #include "tokenizer_p.h"
21
22 /*
23 All language tags should be valid according to IETF BCP 47, as codified in RFC 4646.
24 ISO 639-1 codes should be used for the language part except for cases where there
25 exists no code, then 639-3 codes should be used. Country codes should only be used
26 in special cases. Scripts can be differentiated by IANA subtags, available here:
27 http://www.iana.org/assignments/language-subtag-registry
28 The script tags correspond to ISO 15924
29
30 An overview of the best practices concerning language tagging is available here:
31 http://www.w3.org/International/articles/language-tags/Overview.en.php
32
33 lang tags should use underscores (_) rather than hyphens (-) to separate subsections.
34
35 EXCEPTIONS:
For cases of known differences between the above tagging scheme and major
spellcheckers such as aspell/hunspell/myspell, the scheme used by the spell checkers
shall be used. All exceptions shall be noted here:
39
40 BCP SPELLCHECK
41 az-Latn az
42
43 */
44
45 namespace Sonnet
46 {
47 class GuessLanguagePrivate
48 {
49 public:
50 GuessLanguagePrivate();
51 // language trigram score
52 static QHash<QString, QHash<QString, int>> s_knownModels;
53
54 void loadModels();
55 QList<QChar::Script> findRuns(const QString &text);
56 QVector<QString> createOrderedModel(const QString &content);
57 int distance(const QVector<QString> &model, const QHash<QString, int> &knownModel);
58 QStringList guessFromTrigrams(const QString &sample, const QStringList &langs);
59 QStringList identify(const QString &sample, const QList<QChar::Script> &scripts);
60 QString guessFromDictionaries(const QString &sentence, const QStringList &candidates);
61
62 static QSet<QString> s_knownDictionaries;
63 static QMultiHash<QChar::Script, QString> s_scriptLanguages;
64 static QMap<QString, QString> s_dictionaryNameMap;
65
66 const int MIN_LENGTH;
67 int m_maxItems;
68 double m_minConfidence;
69 };
70
71 QHash<QString, QHash<QString, int>> GuessLanguagePrivate::s_knownModels;
72 QSet<QString> GuessLanguagePrivate::s_knownDictionaries;
73 QMultiHash<QChar::Script, QString> GuessLanguagePrivate::s_scriptLanguages;
74 QMap<QString, QString> GuessLanguagePrivate::s_dictionaryNameMap;
75
getNames(QLocale::Script script)76 QStringList getNames(QLocale::Script script)
77 {
78 QStringList locales;
79 const auto matchingLocales = QLocale::matchingLocales(QLocale::AnyLanguage, script, QLocale::AnyCountry);
80 locales.reserve(matchingLocales.size());
81 for (const QLocale &locale : matchingLocales) {
82 locales << locale.name();
83 }
84 return locales;
85 }
86
GuessLanguagePrivate()87 GuessLanguagePrivate::GuessLanguagePrivate()
88 : MIN_LENGTH(5)
89 , m_maxItems(1)
90 , m_minConfidence(0)
91 {
92 if (!s_scriptLanguages.isEmpty()) {
93 return;
94 }
95
96 const QStringList languages = Loader::openLoader()->languages();
97 s_knownDictionaries = QSet<QString>(languages.begin(), languages.end());
98 QSet<QString> dictionaryLanguages;
99 for (const QString &dictName : std::as_const(s_knownDictionaries)) {
100 QString languageName = QLocale(dictName).name();
101 if (languageName.isEmpty()) {
102 qCWarning(SONNET_LOG_CORE) << "Unable to parse name for dictionary" << dictName;
103 continue;
104 }
105 dictionaryLanguages.insert(languageName);
106 }
107
108 QSet<QString> allLanguages;
109 for (int i = 0; i < int(QChar::ScriptCount); i++) {
110 QChar::Script script = static_cast<QChar::Script>(i);
111 QStringList names;
112 switch (script) {
113 case QChar::Script_Latin:
114 names = getNames(QLocale::LatinScript);
115 break;
116 case QChar::Script_Greek:
117 names = getNames(QLocale::GreekScript);
118 break;
119 case QChar::Script_Cyrillic:
120 names = getNames(QLocale::CyrillicScript);
121 break;
122 case QChar::Script_Armenian:
123 names = getNames(QLocale::ArmenianScript);
124 break;
125 case QChar::Script_Hebrew:
126 names = getNames(QLocale::HebrewScript);
127 break;
128 case QChar::Script_Arabic:
129 names = getNames(QLocale::ArabicScript);
130 break;
131 case QChar::Script_Syriac:
132 names = getNames(QLocale::SyriacScript);
133 break;
134 case QChar::Script_Thaana:
135 names = getNames(QLocale::ThaanaScript);
136 break;
137 case QChar::Script_Devanagari:
138 names = getNames(QLocale::DevanagariScript);
139 break;
140 case QChar::Script_Bengali:
141 names = getNames(QLocale::BengaliScript);
142 break;
143 case QChar::Script_Gurmukhi:
144 names = getNames(QLocale::GurmukhiScript);
145 break;
146 case QChar::Script_Gujarati:
147 names = getNames(QLocale::GujaratiScript);
148 break;
149 case QChar::Script_Oriya:
150 names = getNames(QLocale::OriyaScript);
151 break;
152 case QChar::Script_Tamil:
153 names = getNames(QLocale::TamilScript);
154 break;
155 case QChar::Script_Telugu:
156 names = getNames(QLocale::TeluguScript);
157 break;
158 case QChar::Script_Kannada:
159 names = getNames(QLocale::KannadaScript);
160 break;
161 case QChar::Script_Malayalam:
162 names = getNames(QLocale::MalayalamScript);
163 break;
164 case QChar::Script_Sinhala:
165 names = getNames(QLocale::SinhalaScript);
166 break;
167 case QChar::Script_Thai:
168 names = getNames(QLocale::ThaiScript);
169 break;
170 case QChar::Script_Lao:
171 names = getNames(QLocale::LaoScript);
172 break;
173 case QChar::Script_Tibetan:
174 names = getNames(QLocale::TibetanScript);
175 break;
176 case QChar::Script_Myanmar:
177 names = getNames(QLocale::MyanmarScript);
178 break;
179 case QChar::Script_Georgian:
180 names = getNames(QLocale::GeorgianScript);
181 break;
182 case QChar::Script_Hangul:
183 names = getNames(QLocale::HangulScript);
184 break;
185 case QChar::Script_Ethiopic:
186 names = getNames(QLocale::EthiopicScript);
187 break;
188 case QChar::Script_Cherokee:
189 names = getNames(QLocale::CherokeeScript);
190 break;
191 case QChar::Script_CanadianAboriginal:
192 names = getNames(QLocale::CanadianAboriginalScript);
193 break;
194 case QChar::Script_Ogham:
195 names = getNames(QLocale::OghamScript);
196 break;
197 case QChar::Script_Runic:
198 names = getNames(QLocale::RunicScript);
199 break;
200 case QChar::Script_Khmer:
201 names = getNames(QLocale::KhmerScript);
202 break;
203 case QChar::Script_Mongolian:
204 names = getNames(QLocale::MongolianScript);
205 break;
206 case QChar::Script_Hiragana:
207 names = getNames(QLocale::HiraganaScript);
208 break;
209 case QChar::Script_Katakana:
210 names = getNames(QLocale::KatakanaScript);
211 break;
212 case QChar::Script_Bopomofo:
213 names = getNames(QLocale::BopomofoScript);
214 break;
215 case QChar::Script_Han:
216 names = getNames(QLocale::HanScript);
217 break;
218 case QChar::Script_Yi:
219 names = getNames(QLocale::YiScript);
220 break;
221 case QChar::Script_OldItalic:
222 names = getNames(QLocale::OldItalicScript);
223 break;
224 case QChar::Script_Gothic:
225 names = getNames(QLocale::GothicScript);
226 break;
227 case QChar::Script_Deseret:
228 names = getNames(QLocale::DeseretScript);
229 break;
230 case QChar::Script_Tagalog:
231 names = getNames(QLocale::TagalogScript);
232 break;
233 case QChar::Script_Hanunoo:
234 names = getNames(QLocale::HanunooScript);
235 break;
236 case QChar::Script_Buhid:
237 names = getNames(QLocale::BuhidScript);
238 break;
239 case QChar::Script_Tagbanwa:
240 names = getNames(QLocale::TagbanwaScript);
241 break;
242 case QChar::Script_Coptic:
243 names = getNames(QLocale::CopticScript);
244 break;
245 case QChar::Script_Limbu:
246 names = getNames(QLocale::LimbuScript);
247 break;
248 case QChar::Script_TaiLe:
249 names = getNames(QLocale::TaiLeScript);
250 break;
251 case QChar::Script_LinearB:
252 names = getNames(QLocale::LinearBScript);
253 break;
254 case QChar::Script_Ugaritic:
255 names = getNames(QLocale::UgariticScript);
256 break;
257 case QChar::Script_Shavian:
258 names = getNames(QLocale::ShavianScript);
259 break;
260 case QChar::Script_Osmanya:
261 names = getNames(QLocale::OsmanyaScript);
262 break;
263 case QChar::Script_Cypriot:
264 names = getNames(QLocale::CypriotScript);
265 break;
266 case QChar::Script_Braille:
267 names = getNames(QLocale::BrailleScript);
268 break;
269 case QChar::Script_Buginese:
270 names = getNames(QLocale::BugineseScript);
271 break;
272 case QChar::Script_NewTaiLue:
273 names = getNames(QLocale::NewTaiLueScript);
274 break;
275 case QChar::Script_Glagolitic:
276 names = getNames(QLocale::GlagoliticScript);
277 break;
278 case QChar::Script_Tifinagh:
279 names = getNames(QLocale::TifinaghScript);
280 break;
281 case QChar::Script_SylotiNagri:
282 names = getNames(QLocale::SylotiNagriScript);
283 break;
284 case QChar::Script_OldPersian:
285 names = getNames(QLocale::OldPersianScript);
286 break;
287 case QChar::Script_Kharoshthi:
288 names = getNames(QLocale::KharoshthiScript);
289 break;
290 case QChar::Script_Balinese:
291 names = getNames(QLocale::BalineseScript);
292 break;
293 case QChar::Script_Cuneiform:
294 names = getNames(QLocale::CuneiformScript);
295 break;
296 case QChar::Script_Phoenician:
297 names = getNames(QLocale::PhoenicianScript);
298 break;
299 case QChar::Script_PhagsPa:
300 names = getNames(QLocale::PhagsPaScript);
301 break;
302 case QChar::Script_Nko:
303 names = getNames(QLocale::NkoScript);
304 break;
305 case QChar::Script_Sundanese:
306 names = getNames(QLocale::SundaneseScript);
307 break;
308 case QChar::Script_Lepcha:
309 names = getNames(QLocale::LepchaScript);
310 break;
311 case QChar::Script_OlChiki:
312 names = getNames(QLocale::OlChikiScript);
313 break;
314 case QChar::Script_Vai:
315 names = getNames(QLocale::VaiScript);
316 break;
317 case QChar::Script_Saurashtra:
318 names = getNames(QLocale::SaurashtraScript);
319 break;
320 case QChar::Script_KayahLi:
321 names = getNames(QLocale::KayahLiScript);
322 break;
323 case QChar::Script_Rejang:
324 names = getNames(QLocale::RejangScript);
325 break;
326 case QChar::Script_Lycian:
327 names = getNames(QLocale::LycianScript);
328 break;
329 case QChar::Script_Carian:
330 names = getNames(QLocale::CarianScript);
331 break;
332 case QChar::Script_Lydian:
333 names = getNames(QLocale::LydianScript);
334 break;
335 case QChar::Script_Cham:
336 names = getNames(QLocale::ChamScript);
337 break;
338 case QChar::Script_TaiTham:
339 names = getNames(QLocale::LannaScript);
340 break;
341 case QChar::Script_TaiViet:
342 names = getNames(QLocale::TaiVietScript);
343 break;
344 case QChar::Script_Avestan:
345 names = getNames(QLocale::AvestanScript);
346 break;
347 case QChar::Script_EgyptianHieroglyphs:
348 names = getNames(QLocale::EgyptianHieroglyphsScript);
349 break;
350 case QChar::Script_Samaritan:
351 names = getNames(QLocale::SamaritanScript);
352 break;
353 case QChar::Script_Lisu:
354 names = getNames(QLocale::FraserScript);
355 break;
356 case QChar::Script_Bamum:
357 names = getNames(QLocale::BamumScript);
358 break;
359 case QChar::Script_Javanese:
360 names = getNames(QLocale::JavaneseScript);
361 break;
362 case QChar::Script_MeeteiMayek:
363 names = getNames(QLocale::MeiteiMayekScript);
364 break;
365 case QChar::Script_ImperialAramaic:
366 names = getNames(QLocale::ImperialAramaicScript);
367 break;
368 case QChar::Script_OldSouthArabian:
369 names = getNames(QLocale::OldSouthArabianScript);
370 break;
371 case QChar::Script_InscriptionalParthian:
372 names = getNames(QLocale::InscriptionalParthianScript);
373 break;
374 case QChar::Script_InscriptionalPahlavi:
375 names = getNames(QLocale::InscriptionalPahlaviScript);
376 break;
377 case QChar::Script_Kaithi:
378 names = getNames(QLocale::KaithiScript);
379 break;
380 case QChar::Script_Batak:
381 names = getNames(QLocale::BatakScript);
382 break;
383 case QChar::Script_Brahmi:
384 names = getNames(QLocale::BrahmiScript);
385 break;
386 case QChar::Script_Mandaic:
387 names = getNames(QLocale::MandaeanScript);
388 break;
389 case QChar::Script_Chakma:
390 names = getNames(QLocale::ChakmaScript);
391 break;
392 case QChar::Script_MeroiticCursive:
393 case QChar::Script_MeroiticHieroglyphs:
394 names = getNames(QLocale::MeroiticCursiveScript);
395 names.append(getNames(QLocale::MeroiticScript));
396 break;
397 case QChar::Script_Miao:
398 names = getNames(QLocale::PollardPhoneticScript);
399 break;
400 case QChar::Script_Sharada:
401 names = getNames(QLocale::SharadaScript);
402 break;
403 case QChar::Script_SoraSompeng:
404 names = getNames(QLocale::SoraSompengScript);
405 break;
406 case QChar::Script_Takri:
407 names = getNames(QLocale::TakriScript);
408 break;
409 case QChar::Script_CaucasianAlbanian:
410 names = getNames(QLocale::CaucasianAlbanianScript);
411 break;
412 case QChar::Script_BassaVah:
413 names = getNames(QLocale::BassaVahScript);
414 break;
415 case QChar::Script_Duployan:
416 names = getNames(QLocale::DuployanScript);
417 break;
418 case QChar::Script_Elbasan:
419 names = getNames(QLocale::ElbasanScript);
420 break;
421 case QChar::Script_Grantha:
422 names = getNames(QLocale::GranthaScript);
423 break;
424 case QChar::Script_PahawhHmong:
425 names = getNames(QLocale::PahawhHmongScript);
426 break;
427 case QChar::Script_Khojki:
428 names = getNames(QLocale::KhojkiScript);
429 break;
430 case QChar::Script_LinearA:
431 names = getNames(QLocale::LinearAScript);
432 break;
433 case QChar::Script_Mahajani:
434 names = getNames(QLocale::MahajaniScript);
435 break;
436 case QChar::Script_Manichaean:
437 names = getNames(QLocale::ManichaeanScript);
438 break;
439 case QChar::Script_MendeKikakui:
440 names = getNames(QLocale::MendeKikakuiScript);
441 break;
442 case QChar::Script_Modi:
443 names = getNames(QLocale::ModiScript);
444 break;
445 case QChar::Script_Mro:
446 names = getNames(QLocale::MroScript);
447 break;
448 case QChar::Script_OldNorthArabian:
449 names = getNames(QLocale::OldNorthArabianScript);
450 break;
451 case QChar::Script_Nabataean:
452 names = getNames(QLocale::NabataeanScript);
453 break;
454 case QChar::Script_Palmyrene:
455 names = getNames(QLocale::PalmyreneScript);
456 break;
457 case QChar::Script_PauCinHau:
458 names = getNames(QLocale::PauCinHauScript);
459 break;
460 case QChar::Script_OldPermic:
461 names = getNames(QLocale::OldPermicScript);
462 break;
463 case QChar::Script_PsalterPahlavi:
464 names = getNames(QLocale::PsalterPahlaviScript);
465 break;
466 case QChar::Script_Siddham:
467 names = getNames(QLocale::SiddhamScript);
468 break;
469 case QChar::Script_Khudawadi:
470 names = getNames(QLocale::KhudawadiScript);
471 break;
472 case QChar::Script_Tirhuta:
473 names = getNames(QLocale::TirhutaScript);
474 break;
475 case QChar::Script_WarangCiti:
476 names = getNames(QLocale::VarangKshitiScript);
477 break;
478 case QChar::Script_Ahom:
479 names = getNames(QLocale::AhomScript);
480 break;
481 case QChar::Script_AnatolianHieroglyphs:
482 names = getNames(QLocale::AnatolianHieroglyphsScript);
483 break;
484 case QChar::Script_Hatran:
485 names = getNames(QLocale::HatranScript);
486 break;
487 case QChar::Script_Multani:
488 names = getNames(QLocale::MultaniScript);
489 break;
490 case QChar::Script_OldHungarian:
491 names = getNames(QLocale::OldHungarianScript);
492 break;
493 case QChar::Script_Unknown:
494 case QChar::Script_Inherited:
495 case QChar::Script_Common:
496 case QChar::Script_OldTurkic:
497 case QChar::Script_SignWriting:
498 break;
499 default:
500 qCDebug(SONNET_LOG_CORE) << "Unhandled script" << script;
501 break;
502 }
503 allLanguages.unite(QSet<QString>(names.constBegin(), names.constEnd()));
504
505 { // Remove unknown languages
506 QStringList pruned;
507 for (const QString &name : std::as_const(names)) {
508 if (!dictionaryLanguages.contains(name)) {
509 continue;
510 }
511 pruned.append(name);
512 }
513 names = pruned;
514 }
515
516 if (names.isEmpty()) {
517 continue;
518 }
519
520 for (const QString &name : std::as_const(names)) {
521 s_scriptLanguages.insert(script, name);
522 }
523 }
524
525 // Try to handle some badly named dictionaries
526 if (!allLanguages.contains(s_knownDictionaries)) {
527 QSet<QString> dicts(s_knownDictionaries);
528 dicts.subtract(allLanguages);
529 for (const QString &dictName : std::as_const(dicts)) {
530 QString languageName = QLocale(dictName).name();
531 if (languageName.isEmpty()) {
532 qCWarning(SONNET_LOG_CORE) << "Unable to parse language name" << dictName;
533 continue;
534 }
535 s_dictionaryNameMap[languageName] = dictName;
536 if (std::find(s_scriptLanguages.cbegin(), s_scriptLanguages.cend(), languageName) == s_scriptLanguages.cend()) {
537 qCWarning(SONNET_LOG_CORE) << "Unable to handle language from dictionary" << dictName << languageName;
538 }
539 }
540 }
541 }
542
GuessLanguage()543 GuessLanguage::GuessLanguage()
544 : d(new GuessLanguagePrivate)
545 {
546 }
547
~GuessLanguage()548 GuessLanguage::~GuessLanguage()
549 {
550 delete d;
551 }
552
/**
 * Tries to determine the language of @p text.
 *
 * The candidates are gathered from the trigram models and from the languages
 * of the detected scripts that have no trigram model. If exactly one candidate
 * remains it is returned; otherwise the spell checkers of the candidates and
 * of the usable suggestions are consulted. If that also fails, the first
 * usable suggestion is returned.
 *
 * @param text              the text to identify
 * @param suggestionsListIn preferred languages; entries without a known
 *                          dictionary and duplicates are ignored
 * @return the identified dictionary/language name, or an empty string if
 *         @p text is empty or nothing could be determined
 */
QString GuessLanguage::identify(const QString &text, const QStringList &suggestionsListIn) const
{
    if (text.isEmpty()) {
        return QString();
    }

    // Filter for available dictionaries, keeping the caller's order and dropping duplicates
    QStringList suggestionsList;
    for (const QString &suggestion : suggestionsListIn) {
        if (d->s_knownDictionaries.contains(suggestion) && !suggestionsList.contains(suggestion)) {
            suggestionsList.append(suggestion);
        }
    }

    // Load the model on demand
    if (d->s_knownModels.isEmpty()) {
        d->loadModels();
    }

    const QList<QChar::Script> scriptsList = d->findRuns(text);

    QStringList candidateLanguages = d->identify(text, scriptsList);

    // Languages of the detected scripts that have no trigram model can never be
    // produced by the trigram guess, so they are always added as candidates.
    for (const QChar::Script script : scriptsList) {
        const auto languagesList = d->s_scriptLanguages.values(script);
        for (const QString &lang : languagesList) {
            if (!d->s_knownModels.contains(lang)) {
                candidateLanguages.append(lang);
            }
        }
    }

    // Hack for some bad dictionary names: translate back to the name the dictionary is known by
    for (QString &candidate : candidateLanguages) {
        const auto mapped = d->s_dictionaryNameMap.constFind(candidate);
        if (mapped != d->s_dictionaryNameMap.constEnd()) {
            candidate = mapped.value();
        }
    }

    if (candidateLanguages.count() == 1) {
        return candidateLanguages.first();
    }

    // Wasn't able to get a good guess with the trigrams, try checking all
    // dictionaries for the suggested languages.
    candidateLanguages.append(suggestionsList);
    candidateLanguages.removeDuplicates();
    const QString identified = d->guessFromDictionaries(text, candidateLanguages);
    if (!identified.isEmpty()) {
        return identified;
    }

    qCDebug(SONNET_LOG_CORE) << "Unable to identify string with dictionaries:" << text;

    // None of our methods worked, just return the best suggestion
    if (!suggestionsList.isEmpty()) {
        return suggestionsList.first();
    }

    qCDebug(SONNET_LOG_CORE) << "Unable to find any suggestion for" << text;

    // Not even any suggestions, give up
    return QString();
}
618
setLimits(int maxItems,double minConfidence)619 void GuessLanguage::setLimits(int maxItems, double minConfidence)
620 {
621 d->m_maxItems = maxItems;
622 d->m_minConfidence = minConfidence;
623 }
624
loadModels()625 void GuessLanguagePrivate::loadModels()
626 {
627 // use trigrams from resource file, easy to deploy on all platforms
628 const QString triMapFile = QStringLiteral(":/org.kde.sonnet/trigrams.map");
629 qCDebug(SONNET_LOG_CORE) << "Loading trigrams from" << triMapFile;
630
631 QFile sin(triMapFile);
632 if (!sin.open(QIODevice::ReadOnly)) {
633 qCWarning(SONNET_LOG_CORE) << "Sonnet: Unable to load trigram models from file" << triMapFile;
634 return;
635 }
636
637 QDataStream in(&sin);
638 in >> s_knownModels;
639
640 // Sanity check
641 QSet<QString> availableLanguages;
642 QHashIterator<QString, QHash<QString, int>> iterator(s_knownModels);
643 while (iterator.hasNext()) {
644 iterator.next();
645 if (iterator.value().count() < MAXGRAMS) {
646 qCWarning(SONNET_LOG_CORE) << iterator.key() << "is has only" << iterator.value().count() << "trigrams, expected" << MAXGRAMS;
647 }
648 availableLanguages.insert(iterator.key());
649 }
650 QSet<QString> knownLanguages(s_scriptLanguages.constBegin(), s_scriptLanguages.constEnd());
651 knownLanguages.subtract(availableLanguages);
652 if (!knownLanguages.isEmpty()) {
653 qCDebug(SONNET_LOG_CORE) << "Missing trigrams for languages:" << knownLanguages;
654 }
655 }
656
findRuns(const QString & text)657 QList<QChar::Script> GuessLanguagePrivate::findRuns(const QString &text)
658 {
659 QHash<QChar::Script, int> scriptCounts;
660
661 int totalCount = 0;
662
663 for (const QChar c : text) {
664 const QChar::Script script = c.script();
665
666 if (script == QChar::Script_Common || script == QChar::Script_Inherited) {
667 continue;
668 }
669
670 if (!c.isLetter()) {
671 continue;
672 }
673
674 scriptCounts[script]++;
675 totalCount++;
676 }
677
678 QList<QChar::Script> relevantScripts;
679
680 if (totalCount == 0) {
681 return relevantScripts;
682 }
683
684 if (scriptCounts.size() == 1) {
685 return {scriptCounts.cbegin().key()};
686 }
687
688 for (auto it = scriptCounts.cbegin(); it != scriptCounts.cend(); ++it) {
689 // return run types that used for 40% or more of the string
690 const int scriptCount = it.value();
691 const auto currentScript = it.key();
692 if (scriptCount * 100 / totalCount >= 40) {
693 relevantScripts << currentScript;
694 // always return basic latin if found more than 15%.
695 } else if (currentScript == QChar::Script_Latin && scriptCount * 100 / totalCount >= 15) {
696 relevantScripts << currentScript;
697 }
698 }
699
700 return relevantScripts;
701 }
702
identify(const QString & sample,const QList<QChar::Script> & scripts)703 QStringList GuessLanguagePrivate::identify(const QString &sample, const QList<QChar::Script> &scripts)
704 {
705 if (sample.size() < MIN_LENGTH) {
706 return QStringList();
707 }
708
709 QStringList guesses;
710 for (const QChar::Script script : scripts) {
711 guesses.append(guessFromTrigrams(sample, s_scriptLanguages.values(script)));
712 }
713
714 return guesses;
715 }
716
/**
 * Ranks @p languages by the distance between the trigram model of
 * @p sample and the known trigram model of each language.
 *
 * Languages without a known model are ignored. Starting with the closest
 * language, guesses are collected while fewer than m_maxItems have been
 * taken and the accumulated confidence is still below m_minConfidence.
 * Note that the confidence starts at 0, so with m_minConfidence <= 0
 * nothing is collected.
 *
 * @param sample    the text to identify
 * @param languages the languages to consider
 * @return the collected guesses, closest first; empty if no language could
 *         be scored or even the best score is at the maximum distance
 */
QStringList GuessLanguagePrivate::guessFromTrigrams(const QString &sample, const QStringList &languages)
{
    QStringList ret;

    const QVector<QString> sampleTrigrams = createOrderedModel(sample);

    // Sort by score (QMultiMap keeps its keys in ascending order)
    QMultiMap<int, QString> scores;
    for (const QString &language : languages) {
        const auto model = s_knownModels.constFind(language);
        if (model != s_knownModels.constEnd()) {
            scores.insert(distance(sampleTrigrams, model.value()), language);
        }
    }

    // Skip if either no results or best result is completely unknown (distance >= maxdistance)
    if (scores.isEmpty() || scores.firstKey() >= MAXGRAMS * sampleTrigrams.size()) {
        qCDebug(SONNET_LOG_CORE) << "No scores for" << sample;
        return ret;
    }

    int counter = 0;
    double confidence = 0;

    // STL-style iteration works for QMultiMap regardless of whether it derives from QMap
    auto it = scores.constBegin();
    QString prevItem = it.value();
    int prevScore = it.key();
    ++it;

    while (it != scores.constEnd() && counter < m_maxItems && confidence < m_minConfidence) {
        counter++;
        // Scores are ascending, so a score of 0 here means the previous one was 0 too:
        // the relative gap is 0, and dividing would yield NaN.
        if (it.key() != 0) {
            confidence += (it.key() - prevScore) / static_cast<double>(it.key());
        }
        ret += prevItem;
        prevItem = it.value();
        prevScore = it.key();
        ++it;
    }
    if (counter < m_maxItems && confidence < m_minConfidence) {
        ret += prevItem;
    }

    return ret;
}
759
createOrderedModel(const QString & content)760 QVector<QString> GuessLanguagePrivate::createOrderedModel(const QString &content)
761 {
762 QHash<QString, int> trigramCounts;
763
764 // collect trigrams
765 trigramCounts.reserve(content.size() - 2);
766 for (int i = 0; i < (content.size() - 2); ++i) {
767 QString tri = content.mid(i, 3).toLower();
768 trigramCounts[tri]++;
769 }
770
771 // invert the map <freq, trigram>
772 QVector<QPair<int, QString>> trigramFrequencyList;
773 trigramFrequencyList.reserve(trigramCounts.size());
774
775 auto it = trigramCounts.constBegin();
776 for (; it != trigramCounts.constEnd(); ++it) {
777 const QChar *data = it.key().constData();
778 bool hasTwoSpaces = (data[1].isSpace() && (data[0].isSpace() || data[2].isSpace()));
779
780 if (!hasTwoSpaces) {
781 const int freq = it.value();
782 const QString &trigram = it.key();
783 trigramFrequencyList.append({freq, trigram});
784 }
785 }
786
787 // sort descending by frequency
788 std::sort(trigramFrequencyList.begin(), trigramFrequencyList.end(), [](const QPair<int, QString> &a, const QPair<int, QString> &b) {
789 return a.first > b.first;
790 });
791
792 QVector<QString> orderedTrigrams;
793 orderedTrigrams.reserve(trigramFrequencyList.size());
794 for (const auto &tri : std::as_const(trigramFrequencyList)) {
795 orderedTrigrams.append(tri.second);
796 }
797
798 return orderedTrigrams;
799 }
800
distance(const QVector<QString> & model,const QHash<QString,int> & knownModel)801 int GuessLanguagePrivate::distance(const QVector<QString> &model, const QHash<QString, int> &knownModel)
802 {
803 int counter = -1;
804 int dist = 0;
805
806 for (const QString &trigram : model) {
807 const int val = knownModel.value(trigram, -1);
808 if (val != -1) {
809 dist += qAbs(++counter - val);
810 } else {
811 dist += MAXGRAMS;
812 }
813
814 if (counter == (MAXGRAMS - 1)) {
815 break;
816 }
817 }
818
819 return dist;
820 }
821
/**
 * Guesses the language of @p sentence by spell checking it with the
 * dictionaries of all @p candidates.
 *
 * @param sentence   the text to identify
 * @param candidates the languages to try; languages unknown to the Loader
 *                   are skipped with a warning
 * @return the language whose spell checker accepted the most words (on a
 *         tie, the one that sorts first); empty if no spell checker is
 *         available or none accepted any word
 */
QString GuessLanguagePrivate::guessFromDictionaries(const QString &sentence, const QStringList &candidates)
{
    // Fetch the loader and its language list once instead of once per candidate
    const auto loader = Loader::openLoader();
    const QStringList availableLanguages = loader->languages();

    // Try to see how many languages we can get spell checking for
    QList<QSharedPointer<SpellerPlugin>> spellers;
    for (const QString &lang : candidates) {
        if (!availableLanguages.contains(lang)) {
            qCWarning(SONNET_LOG_CORE) << "Dictionary asked for invalid speller" << lang;
            continue;
        }
        QSharedPointer<SpellerPlugin> plugin = loader->cachedSpeller(lang);
        if (!plugin.isNull()) {
            spellers.append(plugin);
        }
    }

    // If there's no spell checkers, give up
    if (spellers.isEmpty()) {
        return QString();
    }

    // language -> number of words its spell checker accepted
    QMap<QString, int> correctHits;

    WordTokenizer tokenizer(sentence);
    while (tokenizer.hasNext()) {
        Token word = tokenizer.next();
        if (!tokenizer.isSpellcheckable()) {
            continue;
        }

        // Materialise the word once, not once per speller
        const QString wordText = word.toString();
        for (const auto &speller : std::as_const(spellers)) {
            if (speller->isCorrect(wordText)) {
                correctHits[speller->language()]++;
            }
        }
    }

    if (correctHits.isEmpty()) {
        return QString();
    }

    // Strict comparison keeps the first (lowest key) entry among equal hit counts
    QMap<QString, int>::const_iterator max = correctHits.constBegin();
    for (QMap<QString, int>::const_iterator itr = correctHits.constBegin(); itr != correctHits.constEnd(); ++itr) {
        if (itr.value() > max.value()) {
            max = itr;
        }
    }
    return max.key();
}
870 }
871