// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
//

// NOTE:
// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
// HAITIAN_CREOLE is detected as such.
// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
// MONTENEGRIN is not detected as such, but likely scores as Serbian.
// CROATIAN is detected in the Latin script
// SERBIAN is detected in the Cyrillic and Latin scripts
// Zhuang is detected in the Latin script only.
//
// The languages X_PIG_LATIN and X_KLINGON are detected in the
// extended calls ExtDetectLanguageSummary().
//
// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
// is high enough. This happens with non-text input such as the bytes of a
// JPEG, and also with text in languages outside the training set.
//
// The following languages are to be detected in multiple scripts:
//  AZERBAIJANI (Latin, Cyrillic*, Arabic*)
//  BURMESE (Latin, Myanmar)
//  HAUSA (Latin, Arabic)
//  KASHMIRI (Arabic, Devanagari)
//  KAZAKH (Latin, Cyrillic, Arabic)
//  KURDISH (Latin*, Arabic)
//  KYRGYZ (Cyrillic, Arabic)
//  LIMBU (Devanagari, Limbu)
//  MONGOLIAN (Cyrillic, Mongolian)
//  SANSKRIT (Latin, Devanagari)
//  SINDHI (Arabic, Devanagari)
//  TAGALOG (Latin, Tagalog)
//  TAJIK (Cyrillic, Arabic*)
//  TATAR (Latin, Cyrillic, Arabic)
//  TURKMEN (Latin, Cyrillic, Arabic)
//  UIGHUR (Latin, Cyrillic, Arabic)
//  UZBEK (Latin, Cyrillic, Arabic)
//
// * Due to a shortage of training text, AZERBAIJANI is not currently detected
//   in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
//   Arabic script.
//

#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_

#include <stdio.h>                      // For FILE
#include <vector>
#include "../internal/lang_script.h"    // For Language

namespace CLD2 {

// Scan interchange-valid UTF-8 bytes and detect most likely language,
// or set of languages.
//
// Design goals:
//   Skip over big stretches of HTML tags
//   Able to return ranges of different languages
//   Relatively small tables and relatively fast processing
//   Thread safe
//
// For HTML documents, tags are skipped, along with <script> ... </script>
// and <style> ... </style> sequences, and entities are expanded.
//
// We distinguish between bytes of the raw input buffer and bytes of non-tag
// text letters. Since tags can be over 50% of the bytes of an HTML Page,
// and are nearly all seven-bit ASCII English, we prefer to distinguish
// language mixture fractions based on just the non-tag text.
//
// Inputs: buffer and buffer_length
//  Code skips HTML tags and expands HTML entities, unless
//  is_plain_text is true
// Outputs:
//  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
//  percent3 is an array of the text percentages 0..100 of the top 3 languages
//  text_bytes is the amount of non-tag/letters-only text found
//  is_reliable set true if the returned Language is some amount more
//   probable than the second-best Language. Calculation is a complex function
//   of the length of the text and the different-script runs of text.
// Return value: the most likely Language for the majority of the input text
//  Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
//  defaults to ENGLISH.
//
// The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
// backwards compatibility with a different detector.
//
// The third version may return UNKNOWN_LANGUAGE, and also returns extended
// language codes from lang_script.h
//


// Instead of individual arguments, pass in hints as an initialized struct
// Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
//
// Pass in hints whenever possible; doing so improves detection accuracy. The
// set of passed-in hints is all information that is external to the text
// itself.
//
// The content_language_hint is intended to come from an HTTP header
// Content-Language: field, the tld_hint from the hostname of a URL, the
// encoding_hint from an encoding detector applied to the input
// document, and the language_hint from any other context you might have.
// The lang= tags inside an HTML document will be picked up as hints
// by code within the compact language detector.
123 124 typedef struct { 125 const char* content_language_hint; // "mi,en" boosts Maori and English 126 const char* tld_hint; // "id" boosts Indonesian 127 int encoding_hint; // SJS boosts Japanese 128 Language language_hint; // ITALIAN boosts it 129 } CLDHints; 130 131 static const int kMaxResultChunkBytes = 65535; 132 133 // For returning a vector of per-language pieces of the input buffer 134 // Unreliable and too-short are mapped to UNKNOWN_LANGUAGE 135 typedef struct { 136 int offset; // Starting byte offset in original buffer 137 uint16 bytes; // Number of bytes in chunk 138 uint16 lang1; // Top lang, as full Language. Apply 139 // static_cast<Language>() to this short value. 140 } ResultChunk; 141 typedef std::vector<ResultChunk> ResultChunkVector; 142 143 144 // Scan interchange-valid UTF-8 bytes and detect most likely language 145 Language DetectLanguage( 146 const char* buffer, 147 int buffer_length, 148 bool is_plain_text, 149 bool* is_reliable); 150 151 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. 152 // language3[0] is usually also the return value 153 Language DetectLanguageSummary( 154 const char* buffer, 155 int buffer_length, 156 bool is_plain_text, 157 Language* language3, 158 int* percent3, 159 int* text_bytes, 160 bool* is_reliable); 161 162 // Same as above, with hints supplied 163 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. 164 // language3[0] is usually also the return value 165 Language DetectLanguageSummary( 166 const char* buffer, 167 int buffer_length, 168 bool is_plain_text, 169 const char* tld_hint, // "id" boosts Indonesian 170 int encoding_hint, // SJS boosts Japanese 171 Language language_hint, // ITALIAN boosts it 172 Language* language3, 173 int* percent3, 174 int* text_bytes, 175 bool* is_reliable); 176 177 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended 178 // languages. 
179 // 180 // Extended languages are additional interface languages and Unicode 181 // single-language scripts, from lang_script.h 182 // 183 // language3[0] is usually also the return value 184 Language ExtDetectLanguageSummary( 185 const char* buffer, 186 int buffer_length, 187 bool is_plain_text, 188 Language* language3, 189 int* percent3, 190 int* text_bytes, 191 bool* is_reliable); 192 193 // Same as above, with hints supplied 194 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended 195 // languages. 196 // 197 // Extended languages are additional Google interface languages and Unicode 198 // single-language scripts, from lang_script.h 199 // 200 // language3[0] is usually also the return value 201 Language ExtDetectLanguageSummary( 202 const char* buffer, 203 int buffer_length, 204 bool is_plain_text, 205 const char* tld_hint, // "id" boosts Indonesian 206 int encoding_hint, // SJS boosts Japanese 207 Language language_hint, // ITALIAN boosts it 208 Language* language3, 209 int* percent3, 210 int* text_bytes, 211 bool* is_reliable); 212 213 // Same as above, and also returns 3 internal language scores as a ratio to 214 // normal score for real text in that language. Scores close to 1.0 indicate 215 // normal text, while scores far away from 1.0 indicate badly-skewed text or 216 // gibberish 217 // 218 Language ExtDetectLanguageSummary( 219 const char* buffer, 220 int buffer_length, 221 bool is_plain_text, 222 const char* tld_hint, // "id" boosts Indonesian 223 int encoding_hint, // SJS boosts Japanese 224 Language language_hint, // ITALIAN boosts it 225 Language* language3, 226 int* percent3, 227 double* normalized_score3, 228 int* text_bytes, 229 bool* is_reliable); 230 231 232 // Use this one. 233 // Hints are collected into a struct. 234 // Flags are passed in (normally zero). 235 // 236 // Also returns 3 internal language scores as a ratio to 237 // normal score for real text in that language. 
Scores close to 1.0 indicate 238 // normal text, while scores far away from 1.0 indicate badly-skewed text or 239 // gibberish 240 // 241 // Returns a vector of chunks in different languages, so that caller may 242 // spell-check, translate, or otherwaise process different parts of the input 243 // buffer in language-dependant ways. 244 // 245 Language ExtDetectLanguageSummary( 246 const char* buffer, 247 int buffer_length, 248 bool is_plain_text, 249 const CLDHints* cld_hints, 250 int flags, 251 Language* language3, 252 int* percent3, 253 double* normalized_score3, 254 ResultChunkVector* resultchunkvector, 255 int* text_bytes, 256 bool* is_reliable); 257 258 // Return version text string 259 // String is "code_version - data_build_date" 260 const char* DetectLanguageVersion(); 261 262 263 // Public use flags, debug output controls 264 static const int kCLDFlagScoreAsQuads = 0x0100; // Force Greek, etc. => quads 265 static const int kCLDFlagHtml = 0x0200; // Debug HTML => stderr 266 static const int kCLDFlagCr = 0x0400; // <cr> per chunk if HTML 267 static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr 268 static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr 269 static const int kCLDFlagEcho = 0x2000; // Echo input => stderr 270 271 272 /*** 273 274 Flag meanings: 275 kCLDFlagScoreAsQuads 276 Normally, several languages are detected solely by their Unicode script. 277 Combined with appropritate lookup tables, this flag forces them instead 278 to be detected via quadgrams. This can be a useful refinement when looking 279 for meaningful text in these languages, instead of just character sets. 280 The default tables do not support this use. 281 kCLDFlagHtml 282 For each detection call, write an HTML file to stderr, showing the text 283 chunks and their detected languages. 284 kCLDFlagCr 285 In that HTML file, force a new line for each chunk. 286 kCLDFlagVerbose 287 In that HTML file, show every lookup entry. 
288 kCLDFlagQuiet 289 In that HTML file, suppress most of the output detail. 290 kCLDFlagEcho 291 Echo every input buffer to stderr. 292 ***/ 293 294 // Debug output: Print the resultchunkvector to file f 295 void DumpResultChunkVector(FILE* f, const char* src, 296 ResultChunkVector* resultchunkvector); 297 298 #ifdef CLD2_DYNAMIC_MODE 299 300 // If compiled with dynamic mode, load data from the specified file location. 301 // If other data has already been loaded, it is discarded and the data is read 302 // in from the specified file location again (even if the file has not changed). 303 // WARNING: Before calling this method, language detection will always fail 304 // and will always return the unknown language. 305 void loadData(const char* fileName); 306 307 // If compiled with dynamic mode, unload the previously-loaded data. 308 // WARNING: After calling this method, language detection will no longer work 309 // and will always return the unknown language. 310 void unloadData(); 311 312 // Returns true if and only if data has been loaded via a call to loadData(...) 313 // and has not been subsequently unladed via a call to unloadDate(). 314 bool isDataLoaded(); 315 316 #endif // #ifdef CLD2_DYNAMIC_MODE 317 318 }; // End namespace CLD2 319 320 #endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ 321