1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 //
16 // Author: dsites@google.com (Dick Sites)
17 // Updated 2014.01 for dual table lookup
18 //
19
20 #include <stdio.h>
21 #include <string.h>
22 #include <string>
23 #include <vector>
24
25 #include "cldutil.h"
26 #include "debug.h"
27 #include "integral_types.h"
28 #include "lang_script.h"
29 #include "utf8statetable.h"
30
31 #ifdef CLD2_DYNAMIC_MODE
32 #include "cld2_dynamic_data.h"
33 #include "cld2_dynamic_data_loader.h"
34 #endif
35 #include "cld2tablesummary.h"
36 #include "compact_lang_det_impl.h"
37 #include "compact_lang_det_hint_code.h"
38 #include "getonescriptspan.h"
39 #include "tote.h"
40
41
42 namespace CLD2 {
43
44 using namespace std;
45
46 // Linker supplies the right tables, From files
47 // cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc
48 // cld_generated_cjk_delta_bi_32.cc generated_distinct_bi_0.cc
49 // cld2_generated_quad*.cc cld2_generated_deltaocta*.cc
50 // cld2_generated_distinctocta*.cc
51 // cld_generated_score_quad_octa_1024_256.cc
52
53 // 2014.01 Now implementing quadgram dual lookup tables, to allow main table
54 // sizes that are 1/3/5 times a power of two, instead of just powers of two.
55 // Gives more flexibility of total footprint for CLD2.
56
57 extern const int kLanguageToPLangSize;
58 extern const int kCloseSetSize;
59
60 extern const UTF8PropObj cld_generated_CjkUni_obj;
61 extern const CLD2TableSummary kCjkCompat_obj;
62 extern const CLD2TableSummary kCjkDeltaBi_obj;
63 extern const CLD2TableSummary kDistinctBiTable_obj;
64 extern const CLD2TableSummary kQuad_obj;
65 extern const CLD2TableSummary kQuad_obj2; // Dual lookup tables
66 extern const CLD2TableSummary kDeltaOcta_obj;
67 extern const CLD2TableSummary kDistinctOcta_obj;
68 extern const short kAvgDeltaOctaScore[];
69
70 #ifdef CLD2_DYNAMIC_MODE
71 // CLD2_DYNAMIC_MODE is defined:
72 // Data will be read from an mmap opened at runtime.
73 static ScoringTables kScoringtables = {
74 NULL, //&cld_generated_CjkUni_obj,
75 NULL, //&kCjkCompat_obj,
76 NULL, //&kCjkDeltaBi_obj,
77 NULL, //&kDistinctBiTable_obj,
78 NULL, //&kQuad_obj,
79 NULL, //&kQuad_obj2,
80 NULL, //&kDeltaOcta_obj,
81 NULL, //&kDistinctOcta_obj,
82 NULL, //kAvgDeltaOctaScore,
83 };
84 static bool dynamicDataLoaded = false;
85 static ScoringTables* dynamicTables = NULL;
86 static void* mmapAddress = NULL;
87 static int mmapLength = 0;
88
isDataLoaded()89 bool isDataLoaded() { return dynamicDataLoaded; }
90
loadData(const char * fileName)91 void loadData(const char* fileName) {
92 if (isDataLoaded()) {
93 unloadData();
94 }
95 dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
96 kScoringtables = *dynamicTables;
97 dynamicDataLoaded = true;
98 };
99
unloadData()100 void unloadData() {
101 if (!dynamicDataLoaded) return;
102 dynamicDataLoaded = false;
103 // unloading will null all the pointers out.
104 CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength);
105 }
106 #else
107 // This initializes kScoringtables.quadgram_obj etc.
108 static const ScoringTables kScoringtables = {
109 &cld_generated_CjkUni_obj,
110 &kCjkCompat_obj,
111 &kCjkDeltaBi_obj,
112 &kDistinctBiTable_obj,
113
114 &kQuad_obj,
115 &kQuad_obj2, // Dual lookup tables
116 &kDeltaOcta_obj,
117 &kDistinctOcta_obj,
118
119 kAvgDeltaOctaScore,
120 };
121 #endif // #ifdef CLD2_DYNAMIC_MODE
122
123
124 static const bool FLAGS_cld_no_minimum_bytes = false;
125 static const bool FLAGS_cld_forcewords = true;
126 static const bool FLAGS_cld_showme = false;
127 static const bool FLAGS_cld_echotext = true;
128 static const int32 FLAGS_cld_textlimit = 160;
129 static const int32 FLAGS_cld_smoothwidth = 20;
130 static const bool FLAGS_cld_2011_hints = true;
131 static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8;
132
133 static const bool FLAGS_dbgscore = false;
134
135
136 static const int kLangHintInitial = 12; // Boost language by N initially
137 static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram
138
139 static const int kShortSpanThresh = 32; // Bytes
140 static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans
141
142 static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing
143 // after this many text bytes
144 static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger sqz
145 static const int kSpacesTriggerPercent = 25; // Trigger sqz if >=25% spaces
146 static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted
147
148 static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks
149 static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces
150 static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted
151
152 static const int kMaxSpaceScan = 32; // Bytes
153
154 static const int kGoodLang1Percent = 70;
155 static const int kGoodLang1and2Percent = 93;
156 static const int kShortTextThresh = 256; // Bytes
157
158 static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads
159 static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads
160
161 static const int kDefaultWordSpan = 256; // Scan at least this many initial
162 // bytes with word scoring
163 static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text
164
165 static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable
166
167 static const int kPredictionTableSize = 4096; // Must be exactly 4096 for
168 // cheap compressor
169
170 static const int kNonEnBoilerplateMinPercent = 17; // <this => no second
171 static const int kNonFIGSBoilerplateMinPercent = 20; // <this => no second
172 static const int kGoodFirstMinPercent = 26; // <this => UNK
173 static const int kGoodFirstReliableMinPercent = 51; // <this => unreli
174 static const int kIgnoreMaxPercent = 20; // >this => unreli
175 static const int kKeepMinPercent = 2; // <this => unreli
176
177
178
179 // Statistically closest language, based on quadgram table
180 // Those that are far from other languges map to UNKNOWN_LANGUAGE
181 // Subscripted by Language
182 //
183 // From lang_correlation.txt and hand-edits
184 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
185 // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
186 // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
187 //
188 static const int kMinCorrPercent = 24; // Pick off how close you want
189 // 24 catches PERSIAN <== ARABIC
190 // but not SPANISH <== PORTUGESE
191 static Language Unknown = UNKNOWN_LANGUAGE;
192
193 // Suspect idea
194 // Subscripted by Language
195 static const Language kClosestAltLanguage[] = {
196 (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH
197 (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH
198 (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH
199 (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH
200 (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH
201 (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN
202 (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW
203 (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN
204 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese
205 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean
206 (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN
207 ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH
208 (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE
209 (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN
210 (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH
211 (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH
212 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese
213 (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH
214 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK
215 (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC
216 ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN
217 ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN
218 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN
219 ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN
220 (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN
221 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore
222 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown
223 (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN
224 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN
225 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN
226 (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH
227 (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN
228 ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG
229 (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH
230 (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN
231 (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI
232 (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN
233 (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI
234 (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN
235 ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN
236 (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY
237 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM
238 ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH
239 ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI
240 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU
241 ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN
242 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL
243 (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN
244 (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE
245 (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN
246 (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU
247 (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI
248 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI
249 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI
250 (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC
251 (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN
252 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO
253 ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE
254 ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA
255 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA
256 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI
257 (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC
258 ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI
259 (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE, // SLOVENIAN
260 (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI
261 ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE
262 ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE
263 (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN
264 (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK
265 // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT
266 (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT
267 (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE
268 (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE
269 (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK
270 ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC
271 (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI
272 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN
273 ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA
274 (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN
275 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN
276 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE
277 (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N
278 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P
279 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B
280 (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA
281 (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU
282 ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI
283 (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO
284 ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN
285 ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ
286 ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON
287 ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI
288 (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH
289 (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN
290 (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI
291 ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR
292 (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH
293 ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN
294 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN
295 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN
296 ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI
297 (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE
298 (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS
299 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH
300 ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE
301 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER
302 (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN
303 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI
304 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE
305 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC
306 ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU
307 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA
308 (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE
309 (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN
310 ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE
311 ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH
312 ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA
313 (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN
314 (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO
315 ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA
316 ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA
317 (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK
318 (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR
319 (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA
320 ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA
321 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED
322 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED
323 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
324 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER
325 ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI
326 ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF
327 ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN
328 ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR
329 ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA
330 (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR
331 ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA
332 (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA
333 ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN
334 ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC
335 ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA
336 ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE
337 ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK
338 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT
339 ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI
340 (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA
341 ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY
342 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU
343 (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO
344 (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI
345 (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN
346 ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO
347 (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT
348 (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT
349 ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA
350 (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA
351 ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK
352 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG
353 ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI
354 (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS
355 (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA
356 ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX
357 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN
358
359 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // AKAN
360 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // IGBO
361 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MAURITIAN_CREOLE
362 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // HAWAIIAN
363 };
364
365 // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
366 // kClosestAltLanguage_has_incorrect_size);
367
368
FlagFinish(int flags)369 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
FlagSqueeze(int flags)370 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
FlagRepeats(int flags)371 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
FlagTop40(int flags)372 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
FlagShort(int flags)373 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
FlagHint(int flags)374 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
FlagUseWords(int flags)375 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
376
377
378 // Defines Top40 packed languages
379
380 // Google top 40 languages
381 //
382 // Tier 0/1 Language enum list (16)
383 // ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS
384 // DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
385 // PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
386 // ARABIC,
387 //
388 // Tier 2 Language enum list (22)
389 // SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
390 // HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
391 // VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
392 // TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
393 // UKRAINIAN, HINDI,
394 //
395 // use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
396 //
397 // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
398
399
DemoteNotTop40(Tote * chunk_tote,uint16 psplus_one)400 void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) {
401 // REVISIT
402 }
403
PrintText(FILE * f,Language cur_lang,const string & temp)404 void PrintText(FILE* f, Language cur_lang, const string& temp) {
405 if (temp.size() == 0) {return;}
406 fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str());
407 }
408
409
410 //------------------------------------------------------------------------------
411 // For --cld_html debugging output. Not thread safe
412 //------------------------------------------------------------------------------
413 static Language prior_lang = UNKNOWN_LANGUAGE;
414 static bool prior_unreliable = false;
415
416 //------------------------------------------------------------------------------
417 // End For --cld_html debugging output
418 //------------------------------------------------------------------------------
419
420
421 // Backscan to word boundary, returning how many bytes n to go back
422 // so that src - n is non-space ans src - n - 1 is space.
423 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
BackscanToSpace(const char * src,int limit)424 int BackscanToSpace(const char* src, int limit) {
425 int n = 0;
426 limit = minint(limit, kMaxSpaceScan);
427 while (n < limit) {
428 if (src[-n - 1] == ' ') {return n;} // We are at _X
429 ++n;
430 }
431 n = 0;
432 while (n < limit) {
433 if ((src[-n] & 0xc0) != 0x80) {return n;} // We are at char begin
434 ++n;
435 }
436 return 0;
437 }
438
439 // Forwardscan to word boundary, returning how many bytes n to go forward
440 // so that src + n is non-space ans src + n - 1 is space.
441 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
ForwardscanToSpace(const char * src,int limit)442 int ForwardscanToSpace(const char* src, int limit) {
443 int n = 0;
444 limit = minint(limit, kMaxSpaceScan);
445 while (n < limit) {
446 if (src[n] == ' ') {return n + 1;} // We are at _X
447 ++n;
448 }
449 n = 0;
450 while (n < limit) {
451 if ((src[n] & 0xc0) != 0x80) {return n;} // We are at char begin
452 ++n;
453 }
454 return 0;
455 }
456
457
// Measures repetitiveness with a cheap predictor: for each UTF-8 character the
// table entry selected by the running hash is the "prediction"; the entry is
// then replaced by the actual character. The more characters are predicted
// correctly, the more compressible (repetitive) the text is.
//
// Works on whole UTF-8 characters rather than bytes, because three-byte UTF-8
// text (Indic, etc.) would otherwise always look highly predictable.
//
// To allow prediction to run across multiple chunks, the caller supplies the
// current 12-bit hash in *hash and an int[4096] prediction table in tbl, both
// initialized to 0 before the first call. *hash is updated on return.
//
// Returns the number of *bytes* correctly predicted: each correctly predicted
// character adds its encoded length, 1..4.
//
// NOTE: May read up to three bytes past isrc + src_len when the last character
// is cut short. Not a problem with valid UTF-8 text.
//
// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
  const unsigned char* cursor = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* const end = cursor + src_len;
  int state = *hash;
  int predicted_bytes = 0;

  while (cursor < end) {
    // Decode one character: its packed byte value and its encoded length.
    const int lead = cursor[0];
    int ch;
    int char_len;
    if (lead < 0xc0) {
      // One-byte or stray continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      ch = lead;
      char_len = 1;
    } else if ((lead & 0xe0) == 0xc0) {
      ch = (lead << 8) | cursor[1];
      char_len = 2;
    } else if ((lead & 0xf0) == 0xe0) {
      ch = (lead << 16) | (cursor[1] << 8) | cursor[2];
      char_len = 3;
    } else {
      ch = (lead << 24) | (cursor[1] << 16) | (cursor[2] << 8) | cursor[3];
      char_len = 4;
    }
    cursor += char_len;

    // Check the prediction, then record what actually came.
    int& slot = tbl[state];
    if (slot == ch) {
      predicted_bytes += char_len;
    }
    slot = ch;

    state = ((state << 4) ^ ch) & 0xfff;
  }

  *hash = state;
  return predicted_bytes;
}
514
515
516
// Counts the spaces in src[0..src_len), four bytes per step, which is a little
// faster than one at a time. The final src_len % 4 bytes are not examined.
int CountSpaces4(const char* src, int src_len) {
  const int whole_groups_len = src_len & ~3;
  int spaces = 0;
  int pos = 0;
  while (pos < whole_groups_len) {
    for (int k = 0; k < 4; ++k) {
      if (src[pos + k] == ' ') {
        ++spaces;
      }
    }
    pos += 4;
  }
  return spaces;
}
529
530
531 // Remove words of text that have more than half their letters predicted
532 // correctly by our cheap predictor, moving the remaining words in-place
533 // to the front of the input buffer.
534 //
535 // To allow running prediction across multiple chunks, caller passes in current
536 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
537 //
538 // Return the new, possibly-shorter length
539 //
540 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
541 // if input does
542 //
CheapRepWordsInplace(char * isrc,int src_len,int * hash,int * tbl)543 int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) {
544 const uint8* src = reinterpret_cast<const uint8*>(isrc);
545 const uint8* srclimit = src + src_len;
546 char* dst = isrc;
547 int local_hash = *hash;
548 char* word_dst = dst; // Start of next word
549 int good_predict_bytes = 0;
550 int word_length_bytes = 0;
551
552 while (src < srclimit) {
553 int c = src[0];
554 int incr = 1;
555 *dst++ = c;
556
557 if (c == ' ') {
558 if ((good_predict_bytes * 2) > word_length_bytes) {
559 // Word is well-predicted: backup to start of this word
560 dst = word_dst;
561 if (FLAGS_cld_showme) {
562 // Mark the deletion point with period
563 // Don't repeat multiple periods
564 // Cannot mark with more bytes or may overwrite unseen input
565 if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
566 *dst++ = '.';
567 *dst++ = ' ';
568 }
569 }
570 }
571 word_dst = dst; // Start of next word
572 good_predict_bytes = 0;
573 word_length_bytes = 0;
574 }
575
576 // Pick up one char and length
577 if (c < 0xc0) {
578 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
579 // Do nothing more
580 } else if ((c & 0xe0) == 0xc0) {
581 // Two-byte
582 *dst++ = src[1];
583 c = (c << 8) | src[1];
584 incr = 2;
585 } else if ((c & 0xf0) == 0xe0) {
586 // Three-byte
587 *dst++ = src[1];
588 *dst++ = src[2];
589 c = (c << 16) | (src[1] << 8) | src[2];
590 incr = 3;
591 } else {
592 // Four-byte
593 *dst++ = src[1];
594 *dst++ = src[2];
595 *dst++ = src[3];
596 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
597 incr = 4;
598 }
599 src += incr;
600 word_length_bytes += incr;
601
602 int p = tbl[local_hash]; // Prediction
603 tbl[local_hash] = c; // Update prediction
604 if (c == p) {
605 good_predict_bytes += incr; // Count good predictions
606 }
607
608 local_hash = ((local_hash << 4) ^ c) & 0xfff;
609 }
610
611 *hash = local_hash;
612
613 if ((dst - isrc) < (src_len - 3)) {
614 // Pad and make last char clean UTF-8 by putting following spaces
615 dst[0] = ' ';
616 dst[1] = ' ';
617 dst[2] = ' ';
618 dst[3] = '\0';
619 } else if ((dst - isrc) < src_len) {
620 // Make last char clean UTF-8 by putting following space off the end
621 dst[0] = ' ';
622 }
623
624 return static_cast<int>(dst - isrc);
625 }
626
627
// Variant of CheapRepWordsInplace that does not remove well-predicted words
// but overwrites them with '.' characters, leaving every byte at its original
// offset. This avoids corrupting the backmap used to generate a vector of
// original-text ranges.
//
// A word is overwritten when more than half of its bytes were predicted; the
// space that ends the word is kept.
//
// As in the removing variant, the caller passes the running 12-bit hash in
// *hash and an int[4096] prediction table in tbl, both initialized to 0 before
// the first call. *hash is updated on return.
//
// Returns the number of bytes written, measured from isrc.
int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
  const unsigned char* in = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* const in_end = in + src_len;
  char* out = isrc;
  char* word_start = out;         // First byte of the word being accumulated
  int state = *hash;
  int predicted_in_word = 0;
  int bytes_in_word = 0;

  while (in < in_end) {
    const int lead = in[0];
    *out++ = lead;

    if (lead == ' ') {
      const bool mostly_predicted = (2 * predicted_in_word) > bytes_in_word;
      if (mostly_predicted) {
        // Blank out [word_start, space) but leave the space itself.
        char* const space_pos = out - 1;
        for (char* q = word_start; q < space_pos; ++q) {
          *q = '.';
        }
      }
      word_start = out;
      predicted_in_word = 0;
      bytes_in_word = 0;
    }

    // Decode the rest of the character, copying its continuation bytes.
    int ch = lead;
    int char_len = 1;
    if (lead >= 0xc0) {
      if ((lead & 0xe0) == 0xc0) {
        *out++ = in[1];
        ch = (lead << 8) | in[1];
        char_len = 2;
      } else if ((lead & 0xf0) == 0xe0) {
        *out++ = in[1];
        *out++ = in[2];
        ch = (lead << 16) | (in[1] << 8) | in[2];
        char_len = 3;
      } else {
        *out++ = in[1];
        *out++ = in[2];
        *out++ = in[3];
        ch = (lead << 24) | (in[1] << 16) | (in[2] << 8) | in[3];
        char_len = 4;
      }
    }
    in += char_len;
    bytes_in_word += char_len;

    // Check the prediction, then record what actually came.
    const int predicted = tbl[state];
    tbl[state] = ch;
    if (predicted == ch) {
      predicted_in_word += char_len;
    }

    state = ((state << 4) ^ ch) & 0xfff;
  }

  *hash = state;

  const long written = static_cast<long>(out - isrc);
  if (written < (src_len - 3)) {
    // Pad and make last char clean UTF-8 by putting following spaces
    out[0] = ' ';
    out[1] = ' ';
    out[2] = ' ';
    out[3] = '\0';
  } else if (written < src_len) {
    // Make last char clean UTF-8 by putting following space off the end
    out[0] = ' ';
  }

  return static_cast<int>(out - isrc);
}
704
705
706 // Remove portions of text that have a high density of spaces, or that are
707 // overly repetitive, squeezing the remaining text in-place to the front of the
708 // input buffer.
709 //
710 // Squeezing looks at density of space/prediced chars in fixed-size chunks,
711 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
712 //
713 // Return the new, possibly-shorter length
714 //
715 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
716 // if input does
717 //
CheapSqueezeInplace(char * isrc,int src_len,int ichunksize)718 int CheapSqueezeInplace(char* isrc,
719 int src_len,
720 int ichunksize) {
721 char* src = isrc;
722 char* dst = src;
723 char* srclimit = src + src_len;
724 bool skipping = false;
725
726 int hash = 0;
727 // Allocate local prediction table.
728 int* predict_tbl = new int[kPredictionTableSize];
729 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
730
731 int chunksize = ichunksize;
732 if (chunksize == 0) {chunksize = kChunksizeDefault;}
733 int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
734 int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
735
736 while (src < srclimit) {
737 int remaining_bytes = srclimit - src;
738 int len = minint(chunksize, remaining_bytes);
739 // Make len land us on a UTF-8 character boundary.
740 // Ah. Also fixes mispredict because we could get out of phase
741 // Loop always terminates at trailing space in buffer
742 while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes
743
744 int space_n = CountSpaces4(src, len);
745 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
746 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
747 // Skip the text
748 if (!skipping) {
749 // Keeping-to-skipping transition; do it at a space
750 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
751 dst -= n;
752 if (dst == isrc) {
753 // Force a leading space if the first chunk is deleted
754 *dst++ = ' ';
755 }
756 if (FLAGS_cld_showme) {
757 // Mark the deletion point with black square U+25A0
758 *dst++ = static_cast<unsigned char>(0xe2);
759 *dst++ = static_cast<unsigned char>(0x96);
760 *dst++ = static_cast<unsigned char>(0xa0);
761 *dst++ = ' ';
762 }
763 skipping = true;
764 }
765 } else {
766 // Keep the text
767 if (skipping) {
768 // Skipping-to-keeping transition; do it at a space
769 int n = ForwardscanToSpace(src, len);
770 src += n;
771 remaining_bytes -= n; // Shrink remaining length
772 len -= n;
773 skipping = false;
774 }
775 // "len" can be negative in some cases
776 if (len > 0) {
777 memmove(dst, src, len);
778 dst += len;
779 }
780 }
781 src += len;
782 }
783
784 if ((dst - isrc) < (src_len - 3)) {
785 // Pad and make last char clean UTF-8 by putting following spaces
786 dst[0] = ' ';
787 dst[1] = ' ';
788 dst[2] = ' ';
789 dst[3] = '\0';
790 } else if ((dst - isrc) < src_len) {
791 // Make last char clean UTF-8 by putting following space off the end
792 dst[0] = ' ';
793 }
794
795 // Deallocate local prediction table
796 delete[] predict_tbl;
797 return static_cast<int>(dst - isrc);
798 }
799
800 // This alternate form overwrites redundant words, thus avoiding corrupting the
801 // backmap for generate a vector of original-text ranges.
CheapSqueezeInplaceOverwrite(char * isrc,int src_len,int ichunksize)802 int CheapSqueezeInplaceOverwrite(char* isrc,
803 int src_len,
804 int ichunksize) {
805 char* src = isrc;
806 char* dst = src;
807 char* srclimit = src + src_len;
808 bool skipping = false;
809
810 int hash = 0;
811 // Allocate local prediction table.
812 int* predict_tbl = new int[kPredictionTableSize];
813 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
814
815 int chunksize = ichunksize;
816 if (chunksize == 0) {chunksize = kChunksizeDefault;}
817 int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
818 int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
819
820 // Always keep first byte (space)
821 ++src;
822 ++dst;
823 while (src < srclimit) {
824 int remaining_bytes = srclimit - src;
825 int len = minint(chunksize, remaining_bytes);
826 // Make len land us on a UTF-8 character boundary.
827 // Ah. Also fixes mispredict because we could get out of phase
828 // Loop always terminates at trailing space in buffer
829 while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes
830
831 int space_n = CountSpaces4(src, len);
832 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
833 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
834 // Overwrite the text [dst-n..dst)
835 if (!skipping) {
836 // Keeping-to-skipping transition; do it at a space
837 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
838 // Text [word_dst..dst) is well-predicted: overwrite
839 for (char* p = dst - n; p < dst; ++p) {*p = '.';}
840 skipping = true;
841 }
842 // Overwrite the text [dst..dst+len)
843 for (char* p = dst; p < dst + len; ++p) {*p = '.';}
844 dst[len - 1] = ' '; // Space at end so we can see what is happening
845 } else {
846 // Keep the text
847 if (skipping) {
848 // Skipping-to-keeping transition; do it at a space
849 int n = ForwardscanToSpace(src, len);
850 // Text [dst..dst+n) is well-predicted: overwrite
851 for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';}
852 skipping = false;
853 }
854 }
855 dst += len;
856 src += len;
857 }
858
859 if ((dst - isrc) < (src_len - 3)) {
860 // Pad and make last char clean UTF-8 by putting following spaces
861 dst[0] = ' ';
862 dst[1] = ' ';
863 dst[2] = ' ';
864 dst[3] = '\0';
865 } else if ((dst - isrc) < src_len) {
866 // Make last char clean UTF-8 by putting following space off the end
867 dst[0] = ' ';
868 }
869
870 // Deallocate local prediction table
871 delete[] predict_tbl;
872 return static_cast<int>(dst - isrc);
873 }
874
875 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
876 // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
877 // Just CountSpaces is about 340 MB/sec
878 // Byte-only CountPredictedBytes is about 150 MB/sec
879 // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
880 // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
881 // Unjammed byte-only both = 170 MB/sec
882 // Jammed byte-only both = 120 MB/sec
883 // Back to original w/slight updates, 110 MB/sec
884 //
CheapSqueezeTriggerTest(const char * src,int src_len,int testsize)885 bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) {
886 // Don't trigger at all on short text
887 if (src_len < testsize) {return false;}
888 int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
889 int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
890 int hash = 0;
891 // Allocate local prediction table.
892 int* predict_tbl = new int[kPredictionTableSize];
893 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
894
895 bool retval = false;
896 if ((CountSpaces4(src, testsize) >= space_thresh) ||
897 (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
898 predict_thresh)) {
899 retval = true;
900 }
901 // Deallocate local prediction table
902 delete[] predict_tbl;
903 return retval;
904 }
905
906
907
908
909 // Delete any extended languages from doc_tote
RemoveExtendedLanguages(DocTote * doc_tote)910 void RemoveExtendedLanguages(DocTote* doc_tote) {
911 // Now a nop
912 }
913
914 static const int kMinReliableKeepPercent = 41; // Remove lang if reli < this
915
916 // For Tier3 languages, require a minimum number of bytes to be first-place lang
917 static const int kGoodFirstT3MinBytes = 24; // <this => no first
918
919 // Move bytes for unreliable langs to another lang or UNKNOWN
920 // doc_tote is sorted, so cannot Add
921 //
922 // If both CHINESE and CHINESET are present and unreliable, do not delete both;
923 // merge both into CHINESE.
924 //
925 //dsites 2009.03.19
926 // we also want to remove Tier3 languages as the first lang if there is very
927 // little text like ej1 ej2 ej3 ej4
928 // maybe fold this back in earlier
929 //
RemoveUnreliableLanguages(DocTote * doc_tote,bool FLAGS_cld2_html,bool FLAGS_cld2_quiet)930 void RemoveUnreliableLanguages(DocTote* doc_tote,
931 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
932 // Prepass to merge some low-reliablility languages
933 // TODO: this shouldn't really reach in to the internal structure of doc_tote
934 int total_bytes = 0;
935 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
936 int plang = doc_tote->Key(sub);
937 if (plang == DocTote::kUnusedKey) {continue;} // Empty slot
938
939 Language lang = static_cast<Language>(plang);
940 int bytes = doc_tote->Value(sub);
941 int reli = doc_tote->Reliability(sub);
942 if (bytes == 0) {continue;} // Zero bytes
943 total_bytes += bytes;
944
945 // Reliable percent = stored reliable score over stored bytecount
946 int reliable_percent = reli / bytes;
947 if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
948
949 // This language is too unreliable to keep, but we might merge it.
950 Language altlang = UNKNOWN_LANGUAGE;
951 if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];}
952 if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative
953
954 // Look for alternative in doc_tote
955 int altsub = doc_tote->Find(altlang);
956 if (altsub < 0) {continue;} // No alternative text
957
958 int bytes2 = doc_tote->Value(altsub);
959 int reli2 = doc_tote->Reliability(altsub);
960 if (bytes2 == 0) {continue;} // Zero bytes
961
962 // Reliable percent is stored reliable score over stored bytecount
963 int reliable_percent2 = reli2 / bytes2;
964
965 // Merge one language into the other. Break ties toward lower lang #
966 int tosub = altsub;
967 int fromsub = sub;
968 bool into_lang = false;
969 if ((reliable_percent2 < reliable_percent) ||
970 ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
971 tosub = sub;
972 fromsub = altsub;
973 into_lang = true;
974 }
975
976 // Make sure merged reliability doesn't drop and is enough to avoid delete
977 int newpercent = maxint(reliable_percent, reliable_percent2);
978 newpercent = maxint(newpercent, kMinReliableKeepPercent);
979 int newbytes = bytes + bytes2;
980 int newreli = newpercent * newbytes;
981
982 doc_tote->SetKey(fromsub, DocTote::kUnusedKey);
983 doc_tote->SetScore(fromsub, 0);
984 doc_tote->SetReliability(fromsub, 0);
985 doc_tote->SetScore(tosub, newbytes);
986 doc_tote->SetReliability(tosub, newreli);
987
988 // Show fate of unreliable languages if at least 10 bytes
989 if (FLAGS_cld2_html && (newbytes >= 10) &&
990 !FLAGS_cld2_quiet) {
991 if (into_lang) {
992 fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
993 LanguageCode(altlang), reliable_percent2, bytes2,
994 LanguageCode(lang));
995 } else {
996 fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
997 LanguageCode(lang), reliable_percent, bytes,
998 LanguageCode(altlang));
999 }
1000 }
1001 }
1002
1003
1004 // Pass to delete any remaining unreliable languages
1005 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1006 int plang = doc_tote->Key(sub);
1007 if (plang == DocTote::kUnusedKey) {continue;} // Empty slot
1008
1009 Language lang = static_cast<Language>(plang);
1010 int bytes = doc_tote->Value(sub);
1011 int reli = doc_tote->Reliability(sub);
1012 if (bytes == 0) {continue;} // Zero bytes
1013
1014 // Reliable percent is stored as reliable score over stored bytecount
1015 int reliable_percent = reli / bytes;
1016 if (reliable_percent >= kMinReliableKeepPercent) { // Keeper?
1017 continue; // yes
1018 }
1019
1020 // Delete unreliable entry
1021 doc_tote->SetKey(sub, DocTote::kUnusedKey);
1022 doc_tote->SetScore(sub, 0);
1023 doc_tote->SetReliability(sub, 0);
1024
1025 // Show fate of unreliable languages if at least 10 bytes
1026 if (FLAGS_cld2_html && (bytes >= 10) &&
1027 !FLAGS_cld2_quiet) {
1028 fprintf(stderr, "{Unreli %s.%dR,%dB} ",
1029 LanguageCode(lang), reliable_percent, bytes);
1030 }
1031 }
1032
1033 ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");}
1034 }
1035
1036
1037 // Move all the text bytes from lower byte-count to higher one
MoveLang1ToLang2(Language lang1,Language lang2,int lang1_sub,int lang2_sub,DocTote * doc_tote,ResultChunkVector * resultchunkvector)1038 void MoveLang1ToLang2(Language lang1, Language lang2,
1039 int lang1_sub, int lang2_sub,
1040 DocTote* doc_tote,
1041 ResultChunkVector* resultchunkvector) {
1042 // In doc_tote, move all the bytes lang1 => lang2
1043 int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub);
1044 doc_tote->SetValue(lang2_sub, sum);
1045 sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub);
1046 doc_tote->SetScore(lang2_sub, sum);
1047 sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub);
1048 doc_tote->SetReliability(lang2_sub, sum);
1049
1050 // Delete old entry
1051 doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey);
1052 doc_tote->SetScore(lang1_sub, 0);
1053 doc_tote->SetReliability(lang1_sub, 0);
1054
1055 // In resultchunkvector, move all the bytes lang1 => lang2
1056 if (resultchunkvector == NULL) {return;}
1057
1058 int k = 0;
1059 uint16 prior_lang = UNKNOWN_LANGUAGE;
1060 for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) {
1061 ResultChunk* rc = &(*resultchunkvector)[i];
1062 if (rc->lang1 == lang1) {
1063 // Update entry[i] lang1 => lang2
1064 rc->lang1 = lang2;
1065 }
1066 // One change may produce two merges -- entry before and entry after
1067 if ((rc->lang1 == prior_lang) && (k > 0)) {
1068 // Merge with previous, deleting entry[i]
1069 ResultChunk* prior_rc = &(*resultchunkvector)[k - 1];
1070 prior_rc->bytes += rc->bytes;
1071 // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1);
1072 } else {
1073 // Keep entry[i]
1074 (*resultchunkvector)[k] = (*resultchunkvector)[i];
1075 // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k);
1076 ++k;
1077 }
1078 prior_lang = rc->lang1;
1079 }
1080 resultchunkvector->resize(k);
1081 }
1082
1083
1084
1085 // Move less likely byte count to more likely for close pairs of languages
1086 // If given, also update resultchunkvector
RefineScoredClosePairs(DocTote * doc_tote,ResultChunkVector * resultchunkvector,bool FLAGS_cld2_html,bool FLAGS_cld2_quiet)1087 void RefineScoredClosePairs(DocTote* doc_tote,
1088 ResultChunkVector* resultchunkvector,
1089 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
1090 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1091 int close_packedlang = doc_tote->Key(sub);
1092 int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang));
1093 if (subscr == 0) {continue;}
1094
1095 // We have a close pair language -- if the other one is also scored and the
1096 // longword score differs enough, put all our eggs into one basket
1097
1098 // Nonzero longword score: Go look for the other of this pair
1099 for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
1100 if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) {
1101 // We have a matching pair
1102 int close_packedlang2 = doc_tote->Key(sub2);
1103
1104 // Move all the text bytes from lower byte-count to higher one
1105 int from_sub, to_sub;
1106 Language from_lang, to_lang;
1107 if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
1108 from_sub = sub;
1109 to_sub = sub2;
1110 from_lang = static_cast<Language>(close_packedlang);
1111 to_lang = static_cast<Language>(close_packedlang2);
1112 } else {
1113 from_sub = sub2;
1114 to_sub = sub;
1115 from_lang = static_cast<Language>(close_packedlang2);
1116 to_lang = static_cast<Language>(close_packedlang);
1117 }
1118
1119 if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
1120 // Show fate of closepair language
1121 int val = doc_tote->Value(from_sub); // byte count
1122 int reli = doc_tote->Reliability(from_sub);
1123 int reliable_percent = reli / (val ? val : 1); // avoid zdiv
1124 fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n",
1125 LanguageCode(from_lang),
1126 reliable_percent,
1127 doc_tote->Value(from_sub),
1128 LanguageCode(to_lang));
1129 }
1130 MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub,
1131 doc_tote, resultchunkvector);
1132 break; // Exit inner for sub2 loop
1133 }
1134 } // End for sub2
1135 } // End for sub
1136 }
1137
1138
ApplyAllLanguageHints(Tote * chunk_tote,int tote_grams,uint8 * lang_hint_boost)1139 void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams,
1140 uint8* lang_hint_boost) {
1141 }
1142
1143
PrintHtmlEscapedText(FILE * f,const char * txt,int len)1144 void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
1145 string temp(txt, len);
1146 fprintf(f, "%s", GetHtmlEscapedText(temp).c_str());
1147 }
1148
PrintLang(FILE * f,Tote * chunk_tote,Language cur_lang,bool cur_unreliable,Language prior_lang,bool prior_unreliable)1149 void PrintLang(FILE* f, Tote* chunk_tote,
1150 Language cur_lang, bool cur_unreliable,
1151 Language prior_lang, bool prior_unreliable) {
1152 if (cur_lang == prior_lang) {
1153 fprintf(f, "[]");
1154 } else {
1155 fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : "");
1156 }
1157 }
1158
1159
PrintTopLang(Language top_lang)1160 void PrintTopLang(Language top_lang) {
1161 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1162 fprintf(stderr, "[] ");
1163 } else {
1164 fprintf(stderr, "[%s] ", LanguageName(top_lang));
1165 prior_lang = top_lang;
1166 }
1167 }
1168
PrintTopLangSpeculative(Language top_lang)1169 void PrintTopLangSpeculative(Language top_lang) {
1170 fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
1171 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1172 fprintf(stderr, "[] ");
1173 } else {
1174 fprintf(stderr, "[%s] ", LanguageName(top_lang));
1175 prior_lang = top_lang;
1176 }
1177 fprintf(stderr, "</span>\n");
1178 }
1179
PrintLangs(FILE * f,const Language * language3,const int * percent3,const int * text_bytes,const bool * is_reliable)1180 void PrintLangs(FILE* f, const Language* language3, const int* percent3,
1181 const int* text_bytes, const bool* is_reliable) {
1182 fprintf(f, "<br> Initial_Languages ");
1183 if (language3[0] != UNKNOWN_LANGUAGE) {
1184 fprintf(f, "%s%s(%d%%) ",
1185 LanguageName(language3[0]),
1186 *is_reliable ? "" : "*",
1187 percent3[0]);
1188 }
1189 if (language3[1] != UNKNOWN_LANGUAGE) {
1190 fprintf(f, "%s(%d%%) ", LanguageName(language3[1]), percent3[1]);
1191 }
1192 if (language3[2] != UNKNOWN_LANGUAGE) {
1193 fprintf(f, "%s(%d%%) ", LanguageName(language3[2]), percent3[2]);
1194 }
1195 fprintf(f, "%d bytes \n", *text_bytes);
1196
1197 fprintf(f, "<br>\n");
1198 }
1199
1200
1201 // Return internal probability score (sum) per 1024 bytes
GetNormalizedScore(Language lang,ULScript ulscript,int bytecount,int score)1202 double GetNormalizedScore(Language lang, ULScript ulscript,
1203 int bytecount, int score) {
1204 if (bytecount <= 0) {return 0.0;}
1205 return (score << 10) / bytecount;
1206 }
1207
1208 // Extract return values before fixups
ExtractLangEtc(DocTote * doc_tote,int total_text_bytes,int * reliable_percent3,Language * language3,int * percent3,double * normalized_score3,int * text_bytes,bool * is_reliable)1209 void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes,
1210 int* reliable_percent3, Language* language3, int* percent3,
1211 double* normalized_score3,
1212 int* text_bytes, bool* is_reliable) {
1213 reliable_percent3[0] = 0;
1214 reliable_percent3[1] = 0;
1215 reliable_percent3[2] = 0;
1216 language3[0] = UNKNOWN_LANGUAGE;
1217 language3[1] = UNKNOWN_LANGUAGE;
1218 language3[2] = UNKNOWN_LANGUAGE;
1219 percent3[0] = 0;
1220 percent3[1] = 0;
1221 percent3[2] = 0;
1222 normalized_score3[0] = 0.0;
1223 normalized_score3[1] = 0.0;
1224 normalized_score3[2] = 0.0;
1225
1226 *text_bytes = total_text_bytes;
1227 *is_reliable = false;
1228
1229 int bytecount1 = 0;
1230 int bytecount2 = 0;
1231 int bytecount3 = 0;
1232
1233 int lang1 = doc_tote->Key(0);
1234 if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1235 // We have a top language
1236 language3[0] = static_cast<Language>(lang1);
1237 bytecount1 = doc_tote->Value(0);
1238 int reli1 = doc_tote->Reliability(0);
1239 reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv
1240 normalized_score3[0] = GetNormalizedScore(language3[0],
1241 ULScript_Common,
1242 bytecount1,
1243 doc_tote->Score(0));
1244 }
1245
1246 int lang2 = doc_tote->Key(1);
1247 if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) {
1248 language3[1] = static_cast<Language>(lang2);
1249 bytecount2 = doc_tote->Value(1);
1250 int reli2 = doc_tote->Reliability(1);
1251 reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv
1252 normalized_score3[1] = GetNormalizedScore(language3[1],
1253 ULScript_Common,
1254 bytecount2,
1255 doc_tote->Score(1));
1256 }
1257
1258 int lang3 = doc_tote->Key(2);
1259 if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) {
1260 language3[2] = static_cast<Language>(lang3);
1261 bytecount3 = doc_tote->Value(2);
1262 int reli3 = doc_tote->Reliability(2);
1263 reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv
1264 normalized_score3[2] = GetNormalizedScore(language3[2],
1265 ULScript_Common,
1266 bytecount3,
1267 doc_tote->Score(2));
1268 }
1269
1270 // Increase total bytes to sum (top 3) if low for some reason
1271 int total_bytecount12 = bytecount1 + bytecount2;
1272 int total_bytecount123 = total_bytecount12 + bytecount3;
1273 if (total_text_bytes < total_bytecount123) {
1274 total_text_bytes = total_bytecount123;
1275 *text_bytes = total_text_bytes;
1276 }
1277
1278 // Sum minus previous % gives better roundoff behavior than bytecount/total
1279 int total_text_bytes_div = maxint(1, total_text_bytes); // Avoid zdiv
1280 percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
1281 percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
1282 percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
1283 percent3[2] -= percent3[1];
1284 percent3[1] -= percent3[0];
1285
1286 // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
1287 // Fix this explicitly
1288 if (percent3[1] < percent3[2]) {
1289 ++percent3[1];
1290 --percent3[2];
1291 }
1292 if (percent3[0] < percent3[1]) {
1293 ++percent3[0];
1294 --percent3[1];
1295 }
1296
1297 *text_bytes = total_text_bytes;
1298
1299 if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1300 // We have a top language
1301 // Its reliability is overall result reliability
1302 int bytecount = doc_tote->Value(0);
1303 int reli = doc_tote->Reliability(0);
1304 int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv
1305 *is_reliable = (reliable_percent >= kMinReliableKeepPercent);
1306 } else {
1307 // No top language at all. This can happen with zero text or 100% Klingon
1308 // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable.
1309 *is_reliable = false;
1310 }
1311
1312 // If ignore percent is too large, set unreliable.
1313 int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1314 if ((ignore_percent > kIgnoreMaxPercent)) {
1315 *is_reliable = false;
1316 }
1317 }
1318
IsFIGS(Language lang)1319 bool IsFIGS(Language lang) {
1320 if (lang == FRENCH) {return true;}
1321 if (lang == ITALIAN) {return true;}
1322 if (lang == GERMAN) {return true;}
1323 if (lang == SPANISH) {return true;}
1324 return false;
1325 }
1326
IsEFIGS(Language lang)1327 bool IsEFIGS(Language lang) {
1328 if (lang == ENGLISH) {return true;}
1329 if (lang == FRENCH) {return true;}
1330 if (lang == ITALIAN) {return true;}
1331 if (lang == GERMAN) {return true;}
1332 if (lang == SPANISH) {return true;}
1333 return false;
1334 }
1335
1336 // For Tier3 languages, require more bytes of text to override
1337 // the first-place language
1338 static const int kGoodSecondT1T2MinBytes = 15; // <this => no second
1339 static const int kGoodSecondT3MinBytes = 128; // <this => no second
1340
1341 // Calculate a single summary language for the document, and its reliability.
1342 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
1343 // This is the heart of matching human-rater perception.
1344 // reliable_percent3[] is currently unused
1345 //
1346 // Do not return Tier3 second language unless there are at least 128 bytes
CalcSummaryLang(DocTote * doc_tote,int total_text_bytes,const int * reliable_percent3,const Language * language3,const int * percent3,Language * summary_lang,bool * is_reliable,bool FLAGS_cld2_html,bool FLAGS_cld2_quiet)1347 void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
1348 const int* reliable_percent3,
1349 const Language* language3,
1350 const int* percent3,
1351 Language* summary_lang, bool* is_reliable,
1352 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
1353 // Vector of active languages; changes if we delete some
1354 int slot_count = 3;
1355 int active_slot[3] = {0, 1, 2};
1356
1357 int ignore_percent = 0;
1358 int return_percent = percent3[0]; // Default to top lang
1359 *summary_lang = language3[0];
1360 *is_reliable = true;
1361 if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
1362
1363 // If any of top 3 is IGNORE, remove it and increment ignore_percent
1364 for (int i = 0; i < 3; ++i) {
1365 if (language3[i] == TG_UNKNOWN_LANGUAGE) {
1366 ignore_percent += percent3[i];
1367 // Move the rest up, levaing input vectors unchanged
1368 for (int j=i+1; j < 3; ++j) {
1369 active_slot[j - 1] = active_slot[j];
1370 }
1371 -- slot_count;
1372 // Logically remove Ignore from percentage-text calculation
1373 // (extra 1 in 101 avoids zdiv, biases slightly small)
1374 return_percent = (percent3[0] * 100) / (101 - ignore_percent);
1375 *summary_lang = language3[active_slot[0]];
1376 if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
1377 }
1378 }
1379
1380
1381 // If English and X, where X (not UNK) is big enough,
1382 // assume the English is boilerplate and return X.
1383 // Logically remove English from percentage-text calculation
1384 int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
1385 // Require more bytes of text for Tier3 languages
1386 int minbytesneeded = kGoodSecondT1T2MinBytes;
1387 int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]);
1388
1389 if ((language3[active_slot[0]] == ENGLISH) &&
1390 (language3[active_slot[1]] != ENGLISH) &&
1391 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1392 (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
1393 (second_bytes >= minbytesneeded)) {
1394 ignore_percent += percent3[active_slot[0]];
1395 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1396 *summary_lang = language3[active_slot[1]];
1397 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1398
1399 // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
1400 // assume the FIGS is boilerplate and return X.
1401 // Logically remove FIGS from percentage-text calculation
1402 } else if (IsFIGS(language3[active_slot[0]]) &&
1403 !IsEFIGS(language3[active_slot[1]]) &&
1404 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1405 (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
1406 (second_bytes >= minbytesneeded)) {
1407 ignore_percent += percent3[active_slot[0]];
1408 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1409 *summary_lang = language3[active_slot[1]];
1410 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1411
1412 // Else we are returning the first language, but want to improve its
1413 // return_percent if the second language should be ignored
1414 } else if ((language3[active_slot[1]] == ENGLISH) &&
1415 (language3[active_slot[0]] != ENGLISH)) {
1416 ignore_percent += percent3[active_slot[1]];
1417 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1418 } else if (IsFIGS(language3[active_slot[1]]) &&
1419 !IsEFIGS(language3[active_slot[0]])) {
1420 ignore_percent += percent3[active_slot[1]];
1421 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1422 }
1423
1424 // If return percent is too small (too many languages), return UNKNOWN
1425 if ((return_percent < kGoodFirstMinPercent)) {
1426 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1427 fprintf(stderr, "{Unreli %s %d%% percent too small} ",
1428 LanguageCode(*summary_lang), return_percent);
1429 }
1430 *summary_lang = UNKNOWN_LANGUAGE;
1431 *is_reliable = false;
1432 }
1433
1434 // If return percent is small, return language but set unreliable.
1435 if ((return_percent < kGoodFirstReliableMinPercent)) {
1436 *is_reliable = false;
1437 }
1438
1439 // If ignore percent is too large, set unreliable.
1440 ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1441 if ((ignore_percent > kIgnoreMaxPercent)) {
1442 *is_reliable = false;
1443 }
1444
1445 // If we removed all the active languages, return UNKNOWN
1446 if (slot_count == 0) {
1447 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1448 fprintf(stderr, "{Unreli %s no languages left} ",
1449 LanguageCode(*summary_lang));
1450 }
1451 *summary_lang = UNKNOWN_LANGUAGE;
1452 *is_reliable = false;
1453 }
1454 }
1455
AddLangPriorBoost(Language lang,uint32 langprob,ScoringContext * scoringcontext)1456 void AddLangPriorBoost(Language lang, uint32 langprob,
1457 ScoringContext* scoringcontext) {
1458 // This is called 0..n times with language hints
1459 // but we don't know the script -- so boost either or both Latn, Othr.
1460
1461 if (IsLatnLanguage(lang)) {
1462 LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
1463 int n = langprior_boost->n;
1464 langprior_boost->langprob[n] = langprob;
1465 langprior_boost->n = langprior_boost->wrap(n + 1);
1466 }
1467
1468 if (IsOthrLanguage(lang)) {
1469 LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr;
1470 int n = langprior_boost->n;
1471 langprior_boost->langprob[n] = langprob;
1472 langprior_boost->n = langprior_boost->wrap(n + 1);
1473 }
1474
1475 }
1476
AddOneWhack(Language whacker_lang,Language whackee_lang,ScoringContext * scoringcontext)1477 void AddOneWhack(Language whacker_lang, Language whackee_lang,
1478 ScoringContext* scoringcontext) {
1479 uint32 langprob = MakeLangProb(whackee_lang, 1);
1480 // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn
1481 if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) {
1482 LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
1483 int n = langprior_whack->n;
1484 langprior_whack->langprob[n] = langprob;
1485 langprior_whack->n = langprior_whack->wrap(n + 1);
1486 }
1487 if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) {
1488 LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr;
1489 int n = langprior_whack->n;
1490 langprior_whack->langprob[n] = langprob;
1491 langprior_whack->n = langprior_whack->wrap(n + 1);
1492 }
1493 }
1494
AddCloseLangWhack(Language lang,ScoringContext * scoringcontext)1495 void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) {
1496 // We do not in general want zh-Hans and zh-Hant to be close pairs,
1497 // but we do here.
1498 if (lang == CLD2::CHINESE) {
1499 AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext);
1500 return;
1501 }
1502 if (lang == CLD2::CHINESE_T) {
1503 AddOneWhack(lang, CLD2::CHINESE, scoringcontext);
1504 return;
1505 }
1506
1507 int base_lang_set = LanguageCloseSet(lang);
1508 if (base_lang_set == 0) {return;}
1509 // TODO: add an explicit list of each set to avoid this 512-times loop
1510 for (int i = 0; i < kLanguageToPLangSize; ++i) {
1511 Language lang2 = static_cast<Language>(i);
1512 if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) {
1513 AddOneWhack(lang, lang2, scoringcontext);
1514 }
1515 }
1516 }
1517
1518
ApplyHints(const char * buffer,int buffer_length,bool is_plain_text,const CLDHints * cld_hints,ScoringContext * scoringcontext)1519 void ApplyHints(const char* buffer,
1520 int buffer_length,
1521 bool is_plain_text,
1522 const CLDHints* cld_hints,
1523 ScoringContext* scoringcontext) {
1524 CLDLangPriors lang_priors;
1525 InitCLDLangPriors(&lang_priors);
1526
1527 // We now use lang= tags.
1528 // Last look, circa 2008 found only 15% of web pages with lang= tags and
1529 // many of those were wrong. Now (July 2011), we find 44% of web pages have
1530 // lang= tags, and most of them are correct. So we now give them substantial
1531 // weight in each chunk scored.
1532 if (!is_plain_text) {
1533 // Get any contained language tags in first n KB
1534 int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10;
1535 string lang_tags = GetLangTagsFromHtml(buffer, buffer_length,
1536 max_scan_bytes);
1537 SetCLDLangTagsHint(lang_tags, &lang_priors);
1538 if (scoringcontext->flags_cld2_html) {
1539 if (!lang_tags.empty()) {
1540 fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n",
1541 lang_tags.c_str());
1542 }
1543 }
1544 }
1545
1546 if (cld_hints != NULL) {
1547 if ((cld_hints->content_language_hint != NULL) &&
1548 (cld_hints->content_language_hint[0] != '\0')) {
1549 SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors);
1550 }
1551
1552 // Input is from GetTLD(), already lowercased
1553 if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) {
1554 SetCLDTLDHint(cld_hints->tld_hint, &lang_priors);
1555 }
1556
1557 if (cld_hints->encoding_hint != UNKNOWN_ENCODING) {
1558 Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint);
1559 SetCLDEncodingHint(enc, &lang_priors);
1560 }
1561
1562 if (cld_hints->language_hint != UNKNOWN_LANGUAGE) {
1563 SetCLDLanguageHint(cld_hints->language_hint, &lang_priors);
1564 }
1565 }
1566
1567 // Keep no more than four different languages with hints
1568 TrimCLDLangPriors(4, &lang_priors);
1569
1570 if (scoringcontext->flags_cld2_html) {
1571 string print_temp = DumpCLDLangPriors(&lang_priors);
1572 if (!print_temp.empty()) {
1573 fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n",
1574 print_temp.c_str());
1575 }
1576 }
1577
1578 // Put boosts into ScoringContext
1579 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1580 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1581 int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1582 if (qprob > 0) {
1583 uint32 langprob = MakeLangProb(lang, qprob);
1584 AddLangPriorBoost(lang, langprob, scoringcontext);
1585 }
1586 }
1587
1588 // Put whacks into scoring context
1589 // We do not in general want zh-Hans and zh-Hant to be close pairs,
1590 // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant
1591 std::vector<int> close_set_count(kCloseSetSize + 1, 0);
1592
1593 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1594 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1595 ++close_set_count[LanguageCloseSet(lang)];
1596 if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];}
1597 if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];}
1598 }
1599
1600 // If a boost language is in a close set, force suppressing the others in
1601 // that set, if exactly one of the set is present
1602 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1603 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1604 int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1605 if (qprob > 0) {
1606 int close_set = LanguageCloseSet(lang);
1607 if ((close_set > 0) && (close_set_count[close_set] == 1)) {
1608 AddCloseLangWhack(lang, scoringcontext);
1609 }
1610 if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) &&
1611 (close_set_count[kCloseSetSize] == 1)) {
1612 AddCloseLangWhack(lang, scoringcontext);
1613 }
1614 }
1615 }
1616
1617
1618
1619
1620
1621
1622 }
1623
1624
1625
1626 // Results language3/percent3/text_bytes must be exactly three items
DetectLanguageSummaryV2(const char * buffer,int buffer_length,bool is_plain_text,const CLDHints * cld_hints,bool allow_extended_lang,int flags,Language plus_one,Language * language3,int * percent3,double * normalized_score3,ResultChunkVector * resultchunkvector,int * text_bytes,bool * is_reliable)1627 Language DetectLanguageSummaryV2(
1628 const char* buffer,
1629 int buffer_length,
1630 bool is_plain_text,
1631 const CLDHints* cld_hints,
1632 bool allow_extended_lang,
1633 int flags,
1634 Language plus_one,
1635 Language* language3,
1636 int* percent3,
1637 double* normalized_score3,
1638 ResultChunkVector* resultchunkvector,
1639 int* text_bytes,
1640 bool* is_reliable) {
1641 language3[0] = UNKNOWN_LANGUAGE;
1642 language3[1] = UNKNOWN_LANGUAGE;
1643 language3[2] = UNKNOWN_LANGUAGE;
1644 percent3[0] = 0;
1645 percent3[1] = 0;
1646 percent3[2] = 0;
1647 normalized_score3[0] = 0.0;
1648 normalized_score3[1] = 0.0;
1649 normalized_score3[2] = 0.0;
1650 if (resultchunkvector != NULL) {
1651 resultchunkvector->clear();
1652 }
1653 *text_bytes = 0;
1654 *is_reliable = false;
1655
1656 if ((flags & kCLDFlagEcho) != 0) {
1657 string temp(buffer, buffer_length);
1658 if ((flags & kCLDFlagHtml) != 0) {
1659 fprintf(stderr, "CLD2[%d] '%s'<br>\n",
1660 buffer_length, GetHtmlEscapedText(temp).c_str());
1661 } else {
1662 fprintf(stderr, "CLD2[%d] '%s'\n",
1663 buffer_length, GetPlainEscapedText(temp).c_str());
1664 }
1665 }
1666
1667 #ifdef CLD2_DYNAMIC_MODE
1668 // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file
1669 // hasn't been loaded yet. This is the only sane thing we can do, as there
1670 // are no scoring tables to consult.
1671 bool dataLoaded = isDataLoaded();
1672 if ((flags & kCLDFlagVerbose) != 0) {
1673 fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false"));
1674 }
1675 if (!dataLoaded) {
1676 return UNKNOWN_LANGUAGE;
1677 }
1678 #endif
1679
1680 // Exit now if no text
1681 if (buffer_length == 0) {return UNKNOWN_LANGUAGE;}
1682 if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;}
1683
1684 // Document totals
1685 DocTote doc_tote; // Reliability = 0..100
1686
1687 // ScoringContext carries state across scriptspans
1688 ScoringContext scoringcontext;
1689 scoringcontext.debug_file = stderr;
1690 scoringcontext.flags_cld2_score_as_quads =
1691 ((flags & kCLDFlagScoreAsQuads) != 0);
1692 scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0);
1693 scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0);
1694 scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0);
1695 scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE;
1696 scoringcontext.ulscript = ULScript_Common;
1697 scoringcontext.scoringtables = &kScoringtables;
1698 scoringcontext.scanner = NULL;
1699 scoringcontext.init(); // Clear the internal memory arrays
1700
1701 // Now thread safe.
1702 bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0);
1703 bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0);
1704
1705 ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext);
1706
1707 // Four individual script totals, Latin, Han, other2, other3
1708 int next_other_tote = 2;
1709 int tote_num = 0;
1710
1711 // Four totes for up to four different scripts pending at once
1712 Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other
1713 bool tote_seen[4] = {false, false, false, false};
1714 int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk
1715 ULScript tote_script[4] =
1716 {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common};
1717
1718 // Loop through text spans in a single script
1719 ScriptScanner ss(buffer, buffer_length, is_plain_text);
1720 LangSpan scriptspan;
1721
1722 scoringcontext.scanner = &ss;
1723
1724 scriptspan.text = NULL;
1725 scriptspan.text_bytes = 0;
1726 scriptspan.offset = 0;
1727 scriptspan.ulscript = ULScript_Common;
1728 scriptspan.lang = UNKNOWN_LANGUAGE;
1729
1730 int total_text_bytes = 0;
1731 int textlimit = FLAGS_cld_textlimit << 10; // in KB
1732 if (textlimit == 0) {textlimit = 0x7fffffff;}
1733
1734 int advance_by = 2; // Advance 2 bytes
1735 int advance_limit = textlimit >> 3; // For first 1/8 of max document
1736
1737 int initial_word_span = kDefaultWordSpan;
1738 if (FLAGS_cld_forcewords) {
1739 initial_word_span = kReallyBigWordSpan;
1740 }
1741
1742 // Pick up chunk sizes
1743 // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
1744 // Sanity check -- force into a reasonable range
1745 int chunksizequads = FLAGS_cld_smoothwidth;
1746 chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads),
1747 kMaxChunkSizeQuads);
1748 int chunksizeunis = (chunksizequads * 5) >> 1;
1749
1750 // Varying short-span limit doesn't work well -- skips too much beyond 20KB
1751 // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
1752 int spantooshortlimit = kShortSpanThresh;
1753
1754 // For debugging only. Not thread-safe
1755 prior_lang = UNKNOWN_LANGUAGE;
1756 prior_unreliable = false;
1757
1758 // Allocate full-document prediction table for finding repeating words
1759 int hash = 0;
1760 int* predict_tbl = new int[kPredictionTableSize];
1761 if (FlagRepeats(flags)) {
1762 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
1763 }
1764
1765
1766
1767 // Loop through scriptspans accumulating number of text bytes in each language
1768 while (ss.GetOneScriptSpanLower(&scriptspan)) {
1769 ULScript ulscript = scriptspan.ulscript;
1770
1771 // Squeeze out big chunks of text span if asked to
1772 if (FlagSqueeze(flags)) {
1773 // Remove repetitive or mostly-spaces chunks
1774 int newlen;
1775 int chunksize = 0; // Use the default
1776 if (resultchunkvector != NULL) {
1777 newlen = CheapSqueezeInplaceOverwrite(scriptspan.text,
1778 scriptspan.text_bytes,
1779 chunksize);
1780 } else {
1781 newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
1782 chunksize);
1783 }
1784 scriptspan.text_bytes = newlen;
1785 } else {
1786 // Check now and then to see if we should be squeezing
1787 if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) &&
1788 !FlagFinish(flags)) {
1789 // fprintf(stderr, "CheapSqueezeTriggerTest, "
1790 // "first %d bytes of %d (>%d/2)<br>\n",
1791 // kCheapSqueezeTestLen,
1792 // scriptspan.text_bytes,
1793 // kCheapSqueezeTestThresh);
1794
1795 if (CheapSqueezeTriggerTest(scriptspan.text,
1796 scriptspan.text_bytes,
1797 kCheapSqueezeTestLen)) {
1798 // Recursive call with big-chunk squeezing set
1799 if (FLAGS_cld2_html || FLAGS_dbgscore) {
1800 fprintf(stderr,
1801 "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
1802 total_text_bytes);
1803 }
1804 // Deallocate full-document prediction table
1805 delete[] predict_tbl;
1806
1807 return DetectLanguageSummaryV2(
1808 buffer,
1809 buffer_length,
1810 is_plain_text,
1811 cld_hints,
1812 allow_extended_lang,
1813 flags | kCLDFlagSqueeze,
1814 plus_one,
1815 language3,
1816 percent3,
1817 normalized_score3,
1818 resultchunkvector,
1819 text_bytes,
1820 is_reliable);
1821 }
1822 }
1823 }
1824
1825 // Remove repetitive words if asked to
1826 if (FlagRepeats(flags)) {
1827 // Remove repetitive words
1828 int newlen;
1829 if (resultchunkvector != NULL) {
1830 newlen = CheapRepWordsInplaceOverwrite(scriptspan.text,
1831 scriptspan.text_bytes,
1832 &hash, predict_tbl);
1833 } else {
1834 newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
1835 &hash, predict_tbl);
1836 }
1837 scriptspan.text_bytes = newlen;
1838 }
1839
1840 // Scoring depends on scriptspan buffer ALWAYS having
1841 // leading space and off-the-end space space space NUL,
1842 // DCHECK(scriptspan.text[0] == ' ');
1843 // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' ');
1844 // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' ');
1845 // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' ');
1846 // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0');
1847
1848 // The real scoring
1849 // Accumulate directly into the document total, or accmulate in one of four
1850 // chunk totals. The purpose of the multiple chunk totals is to piece
1851 // together short choppy pieces of text in alternating scripts. One total is
1852 // dedicated to Latin text, one to Han text, and the other two are dynamicly
1853 // assigned.
1854
1855 scoringcontext.ulscript = scriptspan.ulscript;
1856 // FLAGS_cld2_html = scoringcontext.flags_cld2_html;
1857
1858 ScoreOneScriptSpan(scriptspan,
1859 &scoringcontext,
1860 &doc_tote,
1861 resultchunkvector);
1862
1863 total_text_bytes += scriptspan.text_bytes;
1864 } // End while (ss.GetOneScriptSpanLower())
1865
1866 // Deallocate full-document prediction table
1867 delete[] predict_tbl;
1868
1869 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1870 // If no forced <cr>, put one in front of dump
1871 if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");}
1872 doc_tote.Dump(stderr);
1873 }
1874
1875
1876 // If extended langauges are disallowed, remove them here
1877 if (!allow_extended_lang) {
1878 RemoveExtendedLanguages(&doc_tote);
1879 }
1880
1881 // Force close pairs to one or the other
1882 // If given, also update resultchunkvector
1883 RefineScoredClosePairs(&doc_tote, resultchunkvector,
1884 FLAGS_cld2_html, FLAGS_cld2_quiet);
1885
1886
1887 // Calculate return results
1888 // Find top three byte counts in tote heap
1889 int reliable_percent3[3];
1890
1891 // Cannot use Add, etc. after sorting
1892 doc_tote.Sort(3);
1893
1894 ExtractLangEtc(&doc_tote, total_text_bytes,
1895 reliable_percent3, language3, percent3, normalized_score3,
1896 text_bytes, is_reliable);
1897
1898 bool have_good_answer = false;
1899 if (FlagFinish(flags)) {
1900 // Force a result
1901 have_good_answer = true;
1902 } else if (total_text_bytes <= kShortTextThresh) {
1903 // Don't recurse on short text -- we already did word scores
1904 have_good_answer = true;
1905 } else if (*is_reliable &&
1906 (percent3[0] >= kGoodLang1Percent)) {
1907 have_good_answer = true;
1908 } else if (*is_reliable &&
1909 ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
1910 have_good_answer = true;
1911 }
1912
1913
1914 if (have_good_answer) {
1915 // This is the real, non-recursive return
1916
1917 // Move bytes for unreliable langs to another lang or UNKNOWN
1918 RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
1919
1920 // Redo the result extraction after the removal above
1921 doc_tote.Sort(3);
1922 ExtractLangEtc(&doc_tote, total_text_bytes,
1923 reliable_percent3, language3, percent3, normalized_score3,
1924 text_bytes, is_reliable);
1925
1926
1927
1928 Language summary_lang;
1929 CalcSummaryLang(&doc_tote, total_text_bytes,
1930 reliable_percent3, language3, percent3,
1931 &summary_lang, is_reliable,
1932 FLAGS_cld2_html, FLAGS_cld2_quiet);
1933
1934 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1935 for (int i = 0; i < 3; ++i) {
1936 if (language3[i] != UNKNOWN_LANGUAGE) {
1937 fprintf(stderr, "%s.%dR(%d%%) ",
1938 LanguageCode(language3[i]),
1939 reliable_percent3[i],
1940 percent3[i]);
1941 }
1942 }
1943
1944 fprintf(stderr, "%d bytes ", total_text_bytes);
1945 fprintf(stderr, "= %s%c ",
1946 LanguageName(summary_lang), *is_reliable ? ' ' : '*');
1947 fprintf(stderr, "<br><br>\n");
1948 }
1949
1950 // Slightly condensed if quiet
1951 if (FLAGS_cld2_html && FLAGS_cld2_quiet) {
1952 fprintf(stderr, " ");
1953 for (int i = 0; i < 3; ++i) {
1954 if (language3[i] != UNKNOWN_LANGUAGE) {
1955 fprintf(stderr, " %s %d%% ",
1956 LanguageCode(language3[i]),
1957 percent3[i]);
1958 }
1959 }
1960 fprintf(stderr, "= %s%c ",
1961 LanguageName(summary_lang), *is_reliable ? ' ' : '*');
1962 fprintf(stderr, "<br>\n");
1963 }
1964
1965 return summary_lang;
1966 }
1967
1968 // Not a good answer -- do recursive call to refine
1969 if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
1970 // This is what we hope to improve on in the recursive call, if any
1971 PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
1972 }
1973
1974 // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
1975 // For this purpose, we treate "Ignore" as top40
1976 Language new_plus_one = UNKNOWN_LANGUAGE;
1977
1978 if (total_text_bytes < kShortTextThresh) {
1979 // Short text: Recursive call with top40 and short set
1980 if (FLAGS_cld2_html || FLAGS_dbgscore) {
1981 fprintf(stderr, " ---text_bytes[%d] "
1982 "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
1983 total_text_bytes);
1984 }
1985 return DetectLanguageSummaryV2(
1986 buffer,
1987 buffer_length,
1988 is_plain_text,
1989 cld_hints,
1990 allow_extended_lang,
1991 flags | kCLDFlagTop40 | kCLDFlagRepeats |
1992 kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
1993 new_plus_one,
1994 language3,
1995 percent3,
1996 normalized_score3,
1997 resultchunkvector,
1998 text_bytes,
1999 is_reliable);
2000 }
2001
2002 // Longer text: Recursive call with top40 set
2003 if (FLAGS_cld2_html || FLAGS_dbgscore) {
2004 fprintf(stderr,
2005 " ---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
2006 total_text_bytes);
2007 }
2008 return DetectLanguageSummaryV2(
2009 buffer,
2010 buffer_length,
2011 is_plain_text,
2012 cld_hints,
2013 allow_extended_lang,
2014 flags | kCLDFlagTop40 | kCLDFlagRepeats |
2015 kCLDFlagFinish,
2016 new_plus_one,
2017 language3,
2018 percent3,
2019 normalized_score3,
2020 resultchunkvector,
2021 text_bytes,
2022 is_reliable);
2023 }
2024
2025
2026 // For debugging and wrappers. Not thread safe.
2027 static char temp_detectlanguageversion[32];
2028
2029 // Return version text string
2030 // String is "code_version - data_build_date"
DetectLanguageVersion()2031 const char* DetectLanguageVersion() {
2032 if (kScoringtables.quadgram_obj == NULL) {return "";}
2033 sprintf(temp_detectlanguageversion,
2034 "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate);
2035 return temp_detectlanguageversion;
2036 }
2037
2038
2039 } // End namespace CLD2
2040