1 /*  This file is part of the KDE libraries
2     SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net>
3 
4     SPDX-License-Identifier: LGPL-2.0-or-later
5 */
6 
7 #ifndef GUESSLANGUAGE_H
8 #define GUESSLANGUAGE_H
9 
10 #include <QString>
11 #include <QStringList>
12 
13 #include "sonnetcore_export.h"
14 
15 namespace Sonnet
16 {
17 // Number of trigrams in each file (presumably the per-language trigram model files - TODO confirm against the loader)
18 static const int MAXGRAMS = 300;
19 
20 class GuessLanguagePrivate; // forward declaration only; used in this header solely through the pointer GuessLanguage::d
21 
22 /**
23  * @short GuessLanguage determines the language of a given text.
24  *
25  * GuessLanguage can determine the difference between ~75 languages for a given string. It is
26  * based on a Perl script originally written by Maciej Ceglowski <maciej@ceglowski.com>
27  * called Languid. His script used a two-part heuristic to determine language. First the text
28  * is checked for the scripts it contains, then for each set of languages using those
29  * scripts an n-gram frequency model of a given language is compared to a model of the text.
30  * The most similar language model is assumed to be the language. If no language is found
31  * an empty string is returned.
32  *
33  * Instances cannot be copied: the copy constructor and copy assignment operator are deleted.
34  * @author Jacob Rideout <kde@jacobrideout.net>
35  * @since 4.3
36  */
37 class SONNETCORE_EXPORT GuessLanguage
38 {
39 public:
40     /** Constructor
41      * Creates a new GuessLanguage instance. The constructor takes no
42      * arguments; the text to be checked is passed to identify() instead.
43      * The limits applied by identify() can be changed with setLimits().
44      */
45     GuessLanguage();
46 
47     /** Destructor
48      */
49     ~GuessLanguage();
50     // Copying is disabled: both copy operations below are explicitly deleted.
51     GuessLanguage(const GuessLanguage &) = delete;
52     GuessLanguage &operator=(const GuessLanguage &) = delete;
53 
54     /**
55      * Sets limits on the number of languages returned by identify(). The confidence for each language is computed
56      * as the difference between this and the next language on the list, normalized to the 0-1 range. A reasonable
57      * value to get a fairly sure result is 0.1. The default is to return the best guess without caring about
58      * confidence - exactly as after a call to setLimits(1, 0).
59      * @param maxItems The list returned by identify() will never have more than maxItems items
60      * @param minConfidence The list will have only enough items for their summed confidence to equal
61      * or exceed minConfidence. NOTE(review): identify() as declared here returns one QString, not a list - confirm.
62      */
63     void setLimits(int maxItems, double minConfidence);
64 
65     /**
66      * Returns the two-letter ISO 639-1 code for the language of @p text. A three-letter
67      * code is returned only in the case where a two-letter code does not exist.
68      * @param text the text to be identified
69      * @param suggestions optional list of strings, empty by default; presumably candidate languages used as hints - TODO confirm
70      * @return the code of the presumed language of the text; per the class description, an empty string if no language
71      * is found (presumably also when the confidence required by setLimits() is not reached - TODO confirm)
72      */
73     QString identify(const QString &text, const QStringList &suggestions = QStringList()) const;
74 
75 private:
76     GuessLanguagePrivate *const d; // opaque, forward-declared private data; the pointer itself cannot be reseated
77 };
78 }
79 
80 #endif
81