1 /* This file is part of the KDE libraries 2 SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> 3 4 SPDX-License-Identifier: LGPL-2.0-or-later 5 */ 6 7 #ifndef GUESSLANGUAGE_H 8 #define GUESSLANGUAGE_H 9 10 #include <QString> 11 #include <QStringList> 12 13 #include "sonnetcore_export.h" 14 15 namespace Sonnet 16 { 17 // Amount of trigrams in each file 18 static const int MAXGRAMS = 300; 19 20 class GuessLanguagePrivate; 21 22 /** 23 * @short GuessLanguage determines the language of a given text. 24 * 25 * GuessLanguage can determine the difference between ~75 languages for a given string. It is 26 * based off a Perl script originally written by Maciej Ceglowski <maciej@ceglowski.com> 27 * called Languid. His script used a 2 part heuristic to determine language. First the text 28 * is checked for the scripts it contains, then for each set of languages using those 29 * scripts a n-gram frequency model of a given language is compared to a model of the text. 30 * The most similar language model is assumed to be the language. If no language is found 31 * an empty string is returned. 32 * 33 * 34 * @author Jacob Rideout <kde@jacobrideout.net> 35 * @since 4.3 36 */ 37 class SONNETCORE_EXPORT GuessLanguage 38 { 39 public: 40 /** Constructor 41 * Creates a new GuessLanguage instance. If @p text is specified, 42 * it sets the text to be checked. 43 * @param text the text that is to be checked 44 */ 45 GuessLanguage(); 46 47 /** Destructor 48 */ 49 ~GuessLanguage(); 50 51 GuessLanguage(const GuessLanguage &) = delete; 52 GuessLanguage &operator=(const GuessLanguage &) = delete; 53 54 /** 55 * Sets limits to number of languages returned by identify(). The confidence for each language is computed 56 * as difference between this and next language on the list normalized to 0-1 range. Reasonable value to get 57 * fairly sure result is 0.1 . Default is returning best guess without caring about confidence - exactly 58 * as after call to setLimits(1,0). 59 * @param maxItems The list returned by identify() will never have more than maxItems item 60 * @param minConfidence The list will have only enough items for their summary confidence equal 61 * or exceed minConfidence. 62 */ 63 void setLimits(int maxItems, double minConfidence); 64 65 /** 66 * Returns the 2 digit ISO 639-1 code for the language of the currently 67 * set text and. Three digits are returned only in the case where a 2 digit 68 * code does not exist. If @p text isn't empty, set the text to checked. 69 * @param text to be identified 70 * @return list of the presumed languages of the text, sorted by decreasing confidence. Empty list means 71 * it is impossible to determine language with confidence required by setLimits 72 */ 73 QString identify(const QString &text, const QStringList &suggestions = QStringList()) const; 74 75 private: 76 GuessLanguagePrivate *const d; 77 }; 78 } 79 80 #endif 81